From 6b6c4cbe0b3770ba7ce398500698612759f5b12e Mon Sep 17 00:00:00 2001 From: Yurii Romanyshyn <71635444+yromanyshyn@users.noreply.github.com> Date: Thu, 11 May 2023 09:53:14 +0300 Subject: [PATCH 01/20] [DEE-456] nlp dummy model refactoring (#2511) * nlp dummy model refactoring --- deepchecks/nlp/context.py | 255 ++++++------------ deepchecks/nlp/input_validations.py | 219 ++++++++++++++- deepchecks/utils/typing.py | 4 +- .../under_annotated_segments_test.py | 2 +- .../model_evaluation/confusion_matrix_test.py | 6 +- .../single_dataset_performance_test.py | 12 +- tests/nlp/conftest.py | 2 +- tests/nlp/test_context.py | 250 +++++++++++------ tests/nlp/utils/test_embeddings.py | 1 + .../confusion_matrix_report_test.py | 2 +- tests/utils/metrics_test.py | 2 +- 11 files changed, 482 insertions(+), 273 deletions(-) diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 9068d22eff..09f954f863 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -9,7 +9,6 @@ # ---------------------------------------------------------------------------- # """Module for base nlp context.""" -import collections import typing as t from operator import itemgetter @@ -17,8 +16,9 @@ from deepchecks.core.context import BaseContext from deepchecks.core.errors import (DatasetValidationError, DeepchecksNotSupportedError, DeepchecksValueError, - ModelValidationError, ValidationError) -from deepchecks.nlp.input_validations import compare_dataframes + ModelValidationError) +from deepchecks.nlp.input_validations import (_validate_multilabel, _validate_text_classification, + _validate_token_classification, compare_dataframes) from deepchecks.nlp.metric_utils.scorers import init_validate_scorers from deepchecks.nlp.metric_utils.token_classification import (get_default_token_scorers, get_scorer_dict, validate_scorers) @@ -27,10 +27,10 @@ from deepchecks.nlp.utils.data_inference import infer_observed_and_model_labels from deepchecks.tabular.metric_utils import DeepcheckScorer, get_default_scorers from deepchecks.tabular.utils.task_type import TaskType as TabularTaskType -from deepchecks.tabular.utils.validation import ensure_predictions_proba, ensure_predictions_shape from deepchecks.utils.docref import doclink from deepchecks.utils.logger import get_logger from deepchecks.utils.typing import BasicModel +from deepchecks.utils.validation import is_sequence_not_str __all__ = [ 'Context', @@ -39,13 +39,19 @@ 'TTokenPred' ] -from deepchecks.utils.validation import is_sequence_not_str -TClassPred = t.Union[t.Sequence[t.Union[str, int]], t.Sequence[t.Sequence[t.Union[str, int]]]] -TClassProba = t.Sequence[t.Sequence[float]] -TTokenPred = t.Sequence[t.Sequence[t.Tuple[str, int, int, float]]] +TClassPred = t.Union[ + t.Sequence[int], + t.Sequence[str], + t.Sequence[t.Sequence[int]] +] +TTokenPred = t.Union[ + t.Sequence[t.Sequence[int]], + t.Sequence[t.Sequence[str]], +] + TTextPred = t.Union[TClassPred, TTokenPred] -TTextProba = t.Union[TClassProba] # TODO: incorrect, why union have only one type argument? 
+TTextProba = t.Sequence[t.Sequence[float]] class _DummyModel(BasicModel): @@ -72,68 +78,81 @@ class _DummyModel(BasicModel): predictions: t.Dict[str, t.Dict[int, TTextPred]] proba: t.Dict[str, t.Dict[int, TTextProba]] - def __init__(self, - test: TextData, - y_pred_test: TTextPred, - y_proba_test: TTextProba, - train: t.Union[TextData, None] = None, - y_pred_train: TTextPred = None, - y_proba_train: TTextProba = None, - model_classes: list = None, - validate_data_on_predict: bool = True): + def __init__( + self, + *, + test: TextData, + y_pred_test: TTextPred, + y_proba_test: TTextProba, + model_classes: t.List[t.Any], + train: t.Optional[TextData] = None, + y_pred_train: t.Optional[TTextPred] = None, + y_proba_train: t.Optional[TTextProba] = None, + validate_data_on_predict: bool = True, + multilabel_proba_threshold: float = 0.5 + ): """Initialize dummy model.""" predictions = {} probas = {} - if ((y_proba_train is not None) or (y_proba_test is not None)) and \ - (train.task_type == TaskType.TOKEN_CLASSIFICATION): - raise DeepchecksNotSupportedError('For token classification probabilities are not supported') - if train is not None and test is not None: # check if datasets have same indexes - if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()): - train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()]) - test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()]) + train_index = train.get_original_text_indexes() + test_index = test.get_original_text_indexes() + if set(train_index) & set(test_index): + train._original_text_index = np.asarray([f'train-{i}' for i in train_index]) + test._original_text_index = np.asarray([f'test-{i}' for i in test_index]) # # This is commented out as currently text data indices are len(range(len(data))) # # TODO: Uncomment when text data indices are not len(range(len(data))) # get_logger().warning('train and test datasets have common index - adding "train"/"test"' # ' prefixes. 
To avoid that provide datasets with no common indexes ' # 'or pass the model object instead of the predictions.') - for dataset, y_pred, y_proba in zip([train, test], - [y_pred_train, y_pred_test], - [y_proba_train, y_proba_test]): - if dataset is not None: - if y_pred is not None: - self._validate_prediction(dataset, y_pred, len(model_classes)) - if y_proba is not None: - self._validate_proba(dataset, y_proba, len(model_classes)) - - if dataset.task_type == TaskType.TEXT_CLASSIFICATION: - if (y_pred is None) and (y_proba is not None): - if dataset.is_multi_label_classification(): - y_pred = (np.array(y_proba) > 0.5) # TODO: Replace with user-configurable threshold - else: - y_pred = np.argmax(np.array(y_proba), axis=-1) - y_pred = np.array(model_classes, dtype='str')[y_pred] - - if y_pred is not None: - if dataset.is_multi_label_classification(): - y_pred = np.array(y_pred) - else: - y_pred = np.array(y_pred, dtype='str') - if len(y_pred.shape) > 1 and y_pred.shape[1] == 1: - y_pred = y_pred[:, 0] - ensure_predictions_shape(y_pred, dataset.text) - - if y_proba is not None: - ensure_predictions_proba(y_proba, y_pred) - y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba)) - probas.update({dataset.name: y_proba_dict}) - - if y_pred is not None: - y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred)) - predictions.update({dataset.name: y_pred_dict}) + for dataset, y_pred, y_proba in ( + (train, y_pred_train, y_proba_train), + (test, y_pred_test, y_proba_test), + ): + if dataset is None: + continue + + if dataset.is_multi_label_classification(): + y_pred, y_proba = _validate_multilabel( + dataset=dataset, + predictions=y_pred, + probabilities=y_proba, + n_of_classes=len(model_classes) + ) + if y_pred is None and y_proba is not None: + y_pred = (np.array(y_proba) > multilabel_proba_threshold) + y_pred = y_pred.astype(int) + + elif dataset.task_type is TaskType.TEXT_CLASSIFICATION: + y_pred, y_proba = _validate_text_classification( + dataset=dataset, + predictions=y_pred, + probabilities=y_proba, + n_of_classes=len(model_classes) + ) + if y_pred is None and y_proba is not None: + y_pred = np.argmax(np.array(y_proba), axis=-1) + y_pred = np.array(model_classes, dtype='str')[y_pred] + + elif dataset.task_type is TaskType.TOKEN_CLASSIFICATION: + _validate_token_classification( + dataset=dataset, + predictions=y_pred, + probabilities=y_proba, + ) + + else: + raise ValueError(f'Unknown task type - {type(dataset.task_type)}') + + if y_pred is not None: + y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred)) + predictions.update({dataset.name: y_pred_dict}) + if y_proba is not None: + y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba)) + probas.update({dataset.name: y_proba_dict}) self.predictions = predictions self.probas = probas @@ -142,13 +161,16 @@ def __init__(self, if self.predictions: self.predict = self._predict - self._prediction_indices = \ - {name: set(data_preds.keys()) for name, data_preds in self.predictions.items()} - + self._prediction_indices = { + name: set(data_preds.keys()) + for name, data_preds in self.predictions.items() + } if self.probas: self.predict_proba = self._predict_proba - self._proba_indices = \ - {name: set(data_proba.keys()) for name, data_proba in self.probas.items()} + self._proba_indices = { + name: set(data_proba.keys()) + for name, data_proba in self.probas.items() + } def _predict(self, data: TextData) -> TTextPred: # TODO: Needs to receive list of strings, not TextData """Predict on given data 
by the data indexes.""" @@ -174,111 +196,6 @@ def fit(self, *args, **kwargs): """Just for python 3.6 (sklearn validates fit method).""" pass - @staticmethod - def _validate_prediction(dataset: TextData, prediction: TTextPred, n_classes: int): - """Validate prediction for given dataset.""" - if not (is_sequence_not_str(prediction) - or (isinstance(prediction, np.ndarray) and prediction.ndim == 1)): - raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence') - if len(prediction) != dataset.n_samples: - raise ValidationError(f'Check requires predictions for {dataset.name} to have ' - f'{dataset.n_samples} rows, same as dataset') - - if dataset.task_type == TaskType.TEXT_CLASSIFICATION: - _DummyModel._validate_classification_prediction(dataset, prediction, n_classes) - elif dataset.task_type == TaskType.TOKEN_CLASSIFICATION: - _DummyModel._validate_token_classification_prediction(dataset, prediction) - - @staticmethod - def _validate_classification_prediction(dataset: TextData, prediction: TTextPred, n_classes: int): - """Validate prediction for given text classification dataset.""" - classification_format_error = f'Check requires classification predictions for {dataset.name} to be ' \ - f'either a sequence that can be cast to a 1D numpy array of shape' \ - f' (n_samples,), or a sequence of sequences that can be cast to a 2D ' \ - f'numpy array of shape (n_samples, n_classes) for the multilabel case.' - - try: - prediction = np.array(prediction) - if dataset.is_multi_label_classification(): - prediction = prediction.astype(float) # Multilabel prediction is a binary matrix - else: - prediction = prediction.reshape((-1, 1)) # Multiclass (not multilabel) Prediction can be a string - if prediction.shape[0] != dataset.n_samples: - raise ValidationError(classification_format_error) - except ValueError as e: - raise ValidationError(classification_format_error) from e - pred_shape = prediction.shape - if dataset.is_multi_label_classification(): - if len(pred_shape) == 1 or pred_shape[1] != n_classes: - raise ValidationError(classification_format_error) - if not np.array_equal(prediction, prediction.astype(bool)): - raise ValidationError(f'Check requires classification predictions for {dataset.name} dataset ' - f'to be either 0 or 1') - - @staticmethod - def _validate_token_classification_prediction(dataset: TextData, prediction: TTextPred): - """Validate prediction for given token classification dataset.""" - if not is_sequence_not_str(prediction): - raise ValidationError( - f'Check requires predictions for {dataset.name} to be a sequence of sequences' - ) - - tokenized_text = dataset.tokenized_text - - for idx, sample_predictions in enumerate(prediction): - if not is_sequence_not_str(sample_predictions): - raise ValidationError( - f'Check requires predictions for {dataset.name} to be a sequence of sequences' - ) - - predictions_types_counter = collections.defaultdict(int) - - for p in sample_predictions: - predictions_types_counter[type(p)] += 1 - - if predictions_types_counter[str] > 0 and predictions_types_counter[int] > 0: - raise ValidationError( - f'Check requires predictions for {dataset.name} to be a sequence ' - 'of sequences of strings or integers' - ) - if len(sample_predictions) != len(tokenized_text[idx]): - raise ValidationError( - f'Check requires predictions for {dataset.name} to have ' - 'the same number of tokens as the input text' - ) - - @staticmethod - def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int, - eps: float = 
1e-3): - """Validate predicted probabilities for given dataset.""" - classification_format_error = f'Check requires classification probabilities for {dataset.name} to be a ' \ - f'sequence of sequences that can be cast to a 2D numpy array of shape' \ - f' (n_samples, n_classes)' - - if len(probabilities) != dataset.n_samples: - raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset ' - f'to have {dataset.n_samples} rows, same as dataset') - - if dataset.task_type == TaskType.TEXT_CLASSIFICATION: - try: - probabilities = np.array(probabilities, dtype='float') - except ValueError as e: - raise ValidationError(classification_format_error) from e - proba_shape = probabilities.shape - if len(proba_shape) != 2: - raise ValidationError(classification_format_error) - if proba_shape[1] != n_classes: - raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset ' - f'to have {n_classes} columns, same as the number of classes') - if dataset.is_multi_label_classification(): - if (probabilities > 1).any() or (probabilities < 0).any(): - raise ValidationError(f'Check requires classification probabilities for {dataset.name} ' - f'dataset to be between 0 and 1') - else: - if any(abs(probabilities.sum(axis=1) - 1) > eps): - raise ValidationError(f'Check requires classification probabilities for {dataset.name} ' - f'dataset to be probabilities and sum to 1 for each row') - class Context(BaseContext): """Contains all the data + properties the user has passed to a check/suite, and validates it seamlessly. diff --git a/deepchecks/nlp/input_validations.py b/deepchecks/nlp/input_validations.py index e3ef26250a..e79a2b521c 100644 --- a/deepchecks/nlp/input_validations.py +++ b/deepchecks/nlp/input_validations.py @@ -9,18 +9,22 @@ # ---------------------------------------------------------------------------- # """Module containing input validation functions.""" -from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, cast +import collections +from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Type, cast import numpy as np import pandas as pd -from deepchecks.core.errors import DeepchecksValueError +from deepchecks.core.errors import DeepchecksValueError, ValidationError from deepchecks.nlp.task_type import TaskType, TTextLabel from deepchecks.utils.logger import get_logger from deepchecks.utils.metrics import is_label_none from deepchecks.utils.type_inference import infer_categorical_features from deepchecks.utils.validation import is_sequence_not_str +if TYPE_CHECKING: + from deepchecks.nlp.text_data import TextData + def validate_tokenized_text(tokenized_text: Optional[Sequence[Sequence[str]]]): """Validate tokenized text format.""" @@ -241,3 +245,214 @@ def compare_dataframes( difference = None return DataframesComparison(common, difference) + + +def _validate_text_classification( + *, + dataset: 'TextData', + predictions: Any = None, + probabilities: Any = None, + n_of_classes: Optional[int] = None, + eps: float = 1e-3 +) -> Tuple[ + Optional[np.ndarray], # predictions + Optional[np.ndarray], # probabilities +]: + if predictions is not None: + format_error_message = ( + f'Check requires predictions for the "{dataset.name}" dataset ' + 'to be of a type sequence[str] | sequence[int]' + ) + if not is_sequence_not_str(predictions): + raise ValidationError(format_error_message) + if len(predictions) != dataset.n_samples: + raise ValidationError( + f'Check requires predictions for 
the "{dataset.name}" dataset ' + f'to have {dataset.n_samples} rows, same as dataset' + ) + try: + predictions = np.array(predictions, dtype='object') + except ValueError as e: + raise ValidationError( + 'Failed to cast predictions to a numpy array. ' + f'{format_error_message}' + ) from e + else: + if predictions.ndim == 2 and predictions.shape[1] == 1: + predictions = predictions[:, 0] + if predictions.ndim != 1: + raise ValidationError(format_error_message) + + predictions = np.array([ + str(it) if it is not None else None + for it in predictions + ], dtype='object') + + if probabilities is not None: + format_error_message = ( + f'Check requires classification probabilities for the "{dataset.name}" ' + 'dataset to be of a type sequence[sequence[float]] that can be cast to ' + 'a 2D numpy array of shape (n_samples, n_classes)' + ) + if len(probabilities) != dataset.n_samples: + raise ValidationError( + f'Check requires classification probabilities for the "{dataset.name}" ' + f'dataset to have {dataset.n_samples} rows, same as dataset' + ) + try: + probabilities = np.array(probabilities, dtype='float') + except ValueError as e: + raise ValidationError( + 'Failed to cast probabilities to a numpy array. ' + f'{format_error_message}' + ) from e + else: + if len(probabilities.shape) != 2: + raise ValidationError(format_error_message) + if n_of_classes is not None and probabilities.shape[1] != n_of_classes: + raise ValidationError( + f'Check requires classification probabilities for the "{dataset.name}" dataset ' + f'to have {n_of_classes} columns, same as the number of classes' + ) + if any(abs(probabilities.sum(axis=1) - 1) > eps): + # TODO: better message + raise ValidationError( + f'Check requires classification probabilities for the "{dataset.name}" ' + f'dataset to be probabilities and sum to 1 for each row' + ) + + return predictions, probabilities + + +def _validate_multilabel( + *, + dataset: 'TextData', + predictions: Any = None, + probabilities: Any = None, + n_of_classes: Optional[int] = None, +) -> Tuple[ + Optional[np.ndarray], # predictions + Optional[np.ndarray], # probabilities +]: + if predictions is not None: + format_error_message = ( + 'Check requires multi-label classification predictions for ' + f'the "{dataset.name}" dataset to be of a type sequence[sequence[int]] ' + 'that can be cast to a 2D numpy array of a shape (n_samples, n_classes)' + ) + if not is_sequence_not_str(predictions): + raise ValidationError(format_error_message) + if len(predictions) != dataset.n_samples: + raise ValidationError( + 'Check requires multi-label classification predictions ' + f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, ' + 'same as dataset' + ) + try: + predictions = np.array(predictions).astype(float) + except ValueError as e: + raise ValidationError( + 'Failed to cast multi-label predictions to a numpy array. 
' + f'{format_error_message}' + ) from e + else: + if predictions.ndim != 2: + raise ValidationError(format_error_message) + if n_of_classes is not None and predictions.shape[1] != n_of_classes: + raise ValidationError( + 'Check requires multi-label classification predictions ' + f'for the "{dataset.name}" dataset to have {n_of_classes} columns, ' + 'same as the number of classes' + ) + if not np.array_equal(predictions, predictions.astype(bool)): + raise ValidationError( + 'Check requires multi-label classification predictions ' + f'for the "{dataset.name}" dataset to be either 0 or 1' + ) + if probabilities is not None: + format_error_message = ( + 'Check requires multi-label classification probabilities ' + f'for the "{dataset.name}" to be of a type sequence[sequences[float]] ' + 'that can be cast to a 2D numpy array of a shape (n_samples, n_classes). ' + 'Each label probability value must lay between 0 and 1' + ) + if len(probabilities) != dataset.n_samples: + raise ValidationError( + 'Check requires multi-label classification probabilities ' + f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, ' + 'same as dataset' + ) + try: + probabilities = np.array(probabilities, dtype='float') + except ValueError as e: + raise ValidationError( + 'Failed to cast multi-label probabilities to a numpy ' + f'array. {format_error_message}' + ) from e + else: + if probabilities.ndim != 2: + raise ValidationError(format_error_message) + if n_of_classes is not None and probabilities.shape[1] != n_of_classes: + raise ValidationError( + f'Check requires multi-label classification probabilities ' + f'for the "{dataset.name}" dataset to have {n_of_classes} columns, ' + 'same as the number of classes' + ) + if (probabilities > 1).any() or (probabilities < 0).any(): + # TODO: better message + raise ValidationError(format_error_message) + + return predictions, probabilities + + +def _validate_token_classification( + *, + dataset: 'TextData', + predictions: Any = None, + probabilities: Any = None, +): + if probabilities is not None: + raise ValidationError( + 'For token classification probabilities are not supported' + ) + + if predictions is not None: + format_error_message = ( + 'Check requires token-classification predictions for ' + f'the "{dataset.name}" dataset to be of a type ' + 'sequence[sequence[str]] or sequence[sequence[int]]' + ) + if not is_sequence_not_str(predictions): + raise ValidationError(format_error_message) + if len(predictions) != dataset.n_samples: + raise ValidationError( + 'Check requires token-classification predictions for ' + f'the "{dataset.name}" dataset to have {dataset.n_samples} rows, ' + 'same as dataset' + ) + + for idx, sample_predictions in enumerate(predictions): + if not is_sequence_not_str(sample_predictions): + raise ValidationError(format_error_message) + + predictions_types_counter = _count_types(sample_predictions) + criterias = (str in predictions_types_counter, int in predictions_types_counter) + + if all(criterias) or not any(criterias): + raise ValidationError(format_error_message) + + tokenized_text = dataset.tokenized_text + + if len(sample_predictions) != len(tokenized_text[idx]): + raise ValidationError( + 'Check requires token-classification predictions for ' + f'the "{dataset.name}" dataset to have the same number of tokens ' + 'as the input text' + ) + + +def _count_types(sequence: Sequence[Any]) -> Dict[Type, int]: + counter = collections.defaultdict(int) + for it in sequence: + counter[type(it)] += 1 + return counter diff --git 
a/deepchecks/utils/typing.py b/deepchecks/utils/typing.py index 4722f74817..abc7af3a08 100644 --- a/deepchecks/utils/typing.py +++ b/deepchecks/utils/typing.py @@ -8,10 +8,10 @@ # along with Deepchecks. If not, see . # ---------------------------------------------------------------------------- # +# pylint: disable=invalid-hash-returned,invalid-name,unnecessary-ellipsis """Type definitions.""" from typing import List -# pylint: disable=invalid-hash-returned,invalid-name from typing_extensions import Protocol, runtime_checkable __all__ = ['Hashable', 'BasicModel', 'ClassificationModel'] @@ -46,6 +46,7 @@ class BasicModel(Protocol): def predict(self, X) -> List[Hashable]: """Predict on given X.""" + ... @runtime_checkable @@ -54,3 +55,4 @@ class ClassificationModel(BasicModel, Protocol): def predict_proba(self, X) -> List[Hashable]: """Predict probabilities on given X.""" + ... diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index b214de5cdd..b2c803f799 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -10,7 +10,7 @@ # """Test for the NLP UnderAnnotatedSegments check""" import numpy as np -from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises +from hamcrest import assert_that, calling, close_to, equal_to, has_items, raises from deepchecks.core.errors import DeepchecksProcessError from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments diff --git a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py index efbac75288..7c2a3c77d0 100644 --- a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py +++ b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py @@ -26,7 +26,7 @@ def test_defaults(text_classification_dataset_mock): # Act result = check.run(text_classification_dataset_mock, predictions=['0', '1', '1']) - + confusion_matrix = result.value.to_numpy() # Assert @@ -58,7 +58,7 @@ def test_run_default_scorer_string_class_new_cats_in_model_classes(text_classifi # Act result = check.run(text_classification_string_class_dataset_mock, predictions=['wise', 'new', 'meh']) - + confusion_matrix = result.value.to_numpy() # Assert @@ -179,7 +179,7 @@ def test_condition_misclassified_samples_lower_than_fails(tweet_emotion_train_te x, y = max_misclassified_cell_idx max_misclassified_samples = confusion_matrix[x][y] max_misclassified_samples_ratio = max_misclassified_samples / len(test_ds) - + # Assert assert_that(result.conditions_results[0], equal_condition_result( is_pass=False, diff --git a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py index 3ad5b8eeb1..33bf4bf17f 100644 --- a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py +++ b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py @@ -49,11 +49,13 @@ def test_run_with_scorer_proba_too_many_classes(text_classification_string_class # Act & Assert assert_that( - calling(check.run).with_args(text_classification_string_class_dataset_mock, - probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]], - ), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 2 columns, ' - 'same as the number of classes') + calling(check.run).with_args( 
+ text_classification_string_class_dataset_mock, + probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" dataset to have 2 columns, ' + 'same as the number of classes') ) diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index bf3476991f..e6543ca4f8 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -144,8 +144,8 @@ def text_token_classification_dataset_mock(): def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata): """Mock dataset and probabilities for multilabel classification""" from sklearn.datasets import make_multilabel_classification - from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression + from sklearn.model_selection import train_test_split X, y = make_multilabel_classification(n_samples=3_000, n_features=10, n_classes=3, n_labels=2, random_state=42) diff --git a/tests/nlp/test_context.py b/tests/nlp/test_context.py index ab8cbf1802..795ed56726 100644 --- a/tests/nlp/test_context.py +++ b/tests/nlp/test_context.py @@ -26,44 +26,58 @@ def test_wrong_prediction_format(text_classification_dataset_mock): emtpy_suite = Suite('Empty Suite') # Act & Assert - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_predictions=[0, 0, 1, 1]), - raises(ValidationError, 'Check requires predictions for Train to have 3 rows, same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_predictions=[0, 0, 1, 1]), + raises( + ValidationError, + 'Check requires predictions for the "Train" dataset ' + 'to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_predictions=[[0, 1], [1, 1], [0, 0]]), - raises(ValidationError, CLASSIFICATION_ERROR_FORMAT) + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_predictions=[[0, 1], [1, 1], [0, 0]]), + raises( + ValidationError, + r'Check requires predictions for the "Train" dataset to ' + r'be of a type sequence\[str\] \| sequence\[int\]') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 3 rows,' - ' same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" ' + 'dataset to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_probabilities=[[1, 1, 1], [0, 0, 0], [0.5, 0.5, 0.5]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 2 columns, ' - 'same as the number of classes') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_probabilities=[[1, 1, 1], [0, 0, 0], [0.5, 0.5, 0.5]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" ' + 'dataset to have 2 columns, same as the number of classes') ) - - 
assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_probabilities=[[1, 1], [0, 0], [0.5, 0.2]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to be probabilities and' - ' sum to 1 for each row') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_probabilities=[[1, 1], [0, 0], [0.5, 0.2]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" ' + 'dataset to be probabilities and sum to 1 for each row') ) # Run with no error emtpy_suite.run( train_dataset=text_classification_dataset_mock, train_predictions=[1, 1, 1], - train_probabilities=[[0.9, 0.1], [1, 0], [0.5, 0.5]]) + train_probabilities=[[0.9, 0.1], [1, 0], [0.5, 0.5]] + ) def test_wrong_multilabel_prediction_format(text_multilabel_classification_dataset_mock): @@ -71,49 +85,75 @@ def test_wrong_multilabel_prediction_format(text_multilabel_classification_datas emtpy_suite = Suite('Empty Suite') # Act & Assert - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_predictions=[0, 0, 1, 1]), - raises(ValidationError, 'Check requires predictions for Train to have 3 rows, same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_predictions=[ + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0],]), + raises( + ValidationError, + 'Check requires multi-label classification predictions for ' + 'the "Train" dataset to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_predictions=[0, 1, 1]), - raises(ValidationError, CLASSIFICATION_ERROR_FORMAT) + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_predictions=[0, 1, 1]), + raises( + ValidationError, + r'Check requires multi-label classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[int\]\] that can ' + r'be cast to a 2D numpy array of a shape \(n_samples, n_classes\)') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_predictions=[[0], [0, 1], 1]), - raises(ValidationError, CLASSIFICATION_ERROR_FORMAT) + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_predictions=[[0], [0, 1], 1]), + raises( + ValidationError, + r'Check requires multi-label classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[int\]\] that can ' + r'be cast to a 2D numpy array of a shape \(n_samples, n_classes\)') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 3 rows,' - ' same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), + raises( + ValidationError, + 'Check requires multi-label classification probabilities for ' + 'the "Train" dataset to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - 
train_dataset=text_multilabel_classification_dataset_mock, - train_probabilities=[[1, 1], [0, 0], [0.5, 0.5]]), - raises(ValidationError, 'heck requires classification probabilities for Train dataset to have 3 columns, ' - 'same as the number of classes') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_probabilities=[[1, 1], [0, 0], [0.5, 0.5]]), + raises( + ValidationError, + 'Check requires multi-label classification probabilities for ' + 'the "Train" dataset to have 3 columns, same as the number of classes') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_probabilities=[[1, 1.2, 1], [0, 0, 0.3], [0.5, 0.2, 0.9]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to be between 0 and 1') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_probabilities=[[1, 1.2, 1], [0, 0, 0.3], [0.5, 0.2, 0.9]]), + raises( + ValidationError, + r'Check requires multi-label classification probabilities for the "Train" ' + r'to be of a type sequence\[sequences\[float\]\] that can be cast to a 2D numpy ' + r'array of a shape \(n_samples, n_classes\). Each label probability value ' + r'must lay between 0 and 1') ) # Run with no error emtpy_suite.run( train_dataset=text_multilabel_classification_dataset_mock, train_predictions=[[1, 1, 0], [0, 0, 1], [1, 1, 1]], - train_probabilities=[[0.9, 0.8, 0.3], [0.9, 0.8, 0.3], [0.9, 0.8, 0.3]]) + train_probabilities=[[0.9, 0.8, 0.3], [0.9, 0.8, 0.3], [0.9, 0.8, 0.3]] + ) def test_wrong_token_prediction_format(text_token_classification_dataset_mock): @@ -123,47 +163,69 @@ def test_wrong_token_prediction_format(text_token_classification_dataset_mock): # Act & Assert # Length of predictions does not match length of dataset: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[[1, 2], [3, 4]] - ), - raises(ValidationError, 'Check requires predictions for Train to have 3 rows, same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[[1, 2], [3, 4]]), + raises( + ValidationError, + 'Check requires token-classification predictions for the "Train" ' + 'dataset to have 3 rows, same as dataset') ) # Not a list: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions='PER' - ), - raises(ValidationError, 'Check requires predictions for Train to be a sequence') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions='PER'), + raises( + ValidationError, + r'Check requires token-classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[str\]\] or ' + r'sequence\[sequence\[int\]\]') ) # Not a list of lists: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[3, 3, 3] - ), - raises(ValidationError, 'Check requires predictions for Train to be a sequence of sequences') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[3, 3, 3]), + raises( + ValidationError, + r'Check requires token-classification predictions for the ' + r'"Train" dataset to be of 
a type sequence\[sequence\[str\]\] or ' + r'sequence\[sequence\[int\]\]') ) # Mixed strings and integers: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[['B-PER', 'O', 1, 'O', 'O'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], - ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] - ), - raises(ValidationError, - 'Check requires predictions for Train to be a sequence of sequences of strings or integers') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[ + ['B-PER', 'O', 1, 'O', 'O'], + ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], + ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + ]), + raises( + ValidationError, + r'Check requires token-classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[str\]\] ' + r'or sequence\[sequence\[int\]\]') ) # Length of predictions does not match length of tokenized text: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[['B-PER'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], - ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] - ), - raises(ValidationError, - 'Check requires predictions for Train to have the same number of tokens as the input text') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[ + ['B-PER'], + ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], + ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + ]), + raises( + ValidationError, + 'Check requires token-classification predictions for the "Train" dataset ' + 'to have the same number of tokens as the input text') ) @@ -193,3 +255,13 @@ def test_same_dataset(tweet_emotion_train_test_textdata): # Assert assert_that(result.value['Drift score'], close_to(0.0, 0.001)) + + +def test_check_execution_with_none_value_in_predictions_sequence(text_classification_dataset_mock): + # Arrange + check = SingleDatasetPerformance(scorers=['recall_macro']) + # Act + result = check.run(text_classification_dataset_mock, predictions=[1, None, 1]) + # Assert + assert_that(result.value['Value'], close_to(0.5, 0.001)) + diff --git a/tests/nlp/utils/test_embeddings.py b/tests/nlp/utils/test_embeddings.py index 2e7a0c2d0c..018fdfc675 100644 --- a/tests/nlp/utils/test_embeddings.py +++ b/tests/nlp/utils/test_embeddings.py @@ -12,6 +12,7 @@ import numpy as np from hamcrest import assert_that, equal_to + from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings diff --git a/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py b/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py index 6c34aecfb5..610406253f 100644 --- a/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py +++ b/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py @@ -174,7 +174,7 @@ def test_condition_misclassified_samples_lower_than_fails(iris_split_dataset_and x, y = max_misclassified_cell_idx max_misclassified_samples = confusion_matrix[x][y] max_misclassified_samples_ratio = max_misclassified_samples / len(test) - + assert_that(result.conditions_results[0], equal_condition_result( is_pass=False, name=f'Misclassified cell size lower than {format_percent(threshold)} of the total samples', diff --git a/tests/utils/metrics_test.py b/tests/utils/metrics_test.py index 4e7f99d5b4..ec548a05f5 100644 --- a/tests/utils/metrics_test.py +++ 
b/tests/utils/metrics_test.py @@ -11,7 +11,7 @@ """Test metrics utils""" import pandas as pd from hamcrest import assert_that, calling, close_to, has_entries, is_, raises -from sklearn.metrics import make_scorer, log_loss, mean_squared_error +from sklearn.metrics import log_loss, make_scorer, mean_squared_error from deepchecks.core.errors import DeepchecksValueError from deepchecks.tabular import Dataset From 90d34eb0fa0f6fe6bf6f31e73b7e19b376264a0d Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Thu, 11 May 2023 09:53:29 +0300 Subject: [PATCH 02/20] update main to 0.14.0 (#2521) --- VERSION | 2 +- docs/source/_static/switcher.json | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 0cb517aa61..ea79e20ed4 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.13.1.dev1 \ No newline at end of file +0.14.0.dev1 \ No newline at end of file diff --git a/docs/source/_static/switcher.json b/docs/source/_static/switcher.json index 13b0c4e8b5..15ea2c33d1 100644 --- a/docs/source/_static/switcher.json +++ b/docs/source/_static/switcher.json @@ -6,9 +6,13 @@ }, { "version": "stable", - "name": "0.13 (stable)", + "name": "0.14 (stable)", "url": "https://docs.deepchecks.com/stable/" }, + { + "version": "0.14", + "url": "https://docs.deepchecks.com/0.14/" + }, { "version": "0.13", "url": "https://docs.deepchecks.com/0.13/" From fe3f69e61d2529fe6f1149a97971930516b36dc7 Mon Sep 17 00:00:00 2001 From: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> Date: Thu, 11 May 2023 10:34:37 +0300 Subject: [PATCH 03/20] Plot files under annotated checks (#2516) --- .../under_annotated_segments.py | 10 +- deepchecks/nlp/utils/test_utils.py | 32 ++++++ .../plot_under_annotated_metadata_segments.py | 102 +++++++++++++++++ .../plot_under_annotated_property_segments.py | 103 ++++++++++++++++++ .../plot_metadata_segments_performance.py | 7 +- .../plot_property_segments_performance.py | 10 +- 6 files changed, 249 insertions(+), 15 deletions(-) create mode 100644 deepchecks/nlp/utils/test_utils.py create mode 100644 docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py create mode 100644 docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py diff --git a/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py b/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py index 53585d7625..8f64c62b98 100644 --- a/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py +++ b/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py @@ -20,6 +20,7 @@ from deepchecks.core.check_result import DisplayMap from deepchecks.core.errors import DeepchecksProcessError from deepchecks.nlp import Context, SingleDatasetCheck +from deepchecks.nlp.utils.text import break_to_lines_and_trim from deepchecks.nlp.utils.weak_segments import get_relevant_data_table from deepchecks.utils.abstracts.weak_segment_abstract import WeakSegmentAbstract from deepchecks.utils.metrics import is_label_none @@ -90,7 +91,7 @@ def _generate_scatter_plot_display(self, encoded_data: pd.DataFrame, is_annotate display_tabs = {} if weak_segments.shape[0] > self.n_to_show: weak_segments = weak_segments.iloc[:self.n_to_show, :] - encoded_data['text'] = text + encoded_data['text'] = [break_to_lines_and_trim(sample) for sample in text] # Handle categorical features jitter = 0.25 @@ -111,8 +112,8 @@ def _generate_scatter_plot_display(self, encoded_data: pd.DataFrame, is_annotate if feature_2 != '': # segment by two features feature_2_lower, 
feature_2_upper = self._get_box_boundaries(encoded_data[feature_2], row['Feature2 Range'])
-                hover_template = '' + feature_1 + ': %{x}<br>' + feature_2 + \
-                                 ': %{y}<br>text: %{text}<br>Annotated: '
+                hover_template = '' + feature_1 + ': %{x}<br>' + feature_2 + \
+                                 ': %{y}<br>text: %{text}<br>Annotated: '
                 tab_title = f'{feature_1} vs {feature_2}'
                 range_f1 = self._format_partition_vec_for_display([feature_1_lower, feature_1_upper], feature_1, ', ')
                 range_f2 = self._format_partition_vec_for_display([feature_2_lower, feature_2_upper], feature_2, ', ')
@@ -122,7 +123,7 @@ def _generate_scatter_plot_display(self, encoded_data: pd.DataFrame, is_annotate
                 feature_2 = 'virtual_col'
                 feature_2_lower = encoded_data['virtual_col'].min() * 1.3
                 feature_2_upper = encoded_data['virtual_col'].max() * 1.3
-                hover_template = '' + feature_1 + ': %{x}<br>text: %{text}<br>Annotated: '
+                hover_template = '' + feature_1 + ': %{x}<br>text: %{text}<br>
Annotated: ' tab_title = feature_1 range_f1 = self._format_partition_vec_for_display([feature_1_lower, feature_1_upper], feature_1, ', ') msg = f'Under annotated segment contains samples with {feature_1} in {range_f1[0]}.' @@ -162,7 +163,6 @@ def _generate_scatter_plot_display(self, encoded_data: pd.DataFrame, is_annotate f' in whole data)', font=dict(size=24)), xaxis_title=feature_1, yaxis_title=feature_2 if feature_2 != 'virtual_col' else '', - autosize=False, width=1000, height=600, font=dict(size=14), plot_bgcolor='rgba(245, 245, 245, 1)', xaxis=dict(gridcolor='rgba(200, 200, 200, 0.5)', diff --git a/deepchecks/nlp/utils/test_utils.py b/deepchecks/nlp/utils/test_utils.py new file mode 100644 index 0000000000..19ada36f5f --- /dev/null +++ b/deepchecks/nlp/utils/test_utils.py @@ -0,0 +1,32 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see . +# ---------------------------------------------------------------------------- +# +"""utils for testing.""" + +from deepchecks.nlp.datasets.classification import tweet_emotion + + +def load_modified_tweet_text_data(): + """Load tweet emotion data and modify the label of some samples.""" + text_data = tweet_emotion.load_data(as_train_test=False).copy() + + idx_to_change_metadata = list(text_data.metadata[(text_data.metadata['user_age'] > 40) & ( + text_data.metadata['user_region'] == 'Europe') & (text_data.metadata['user_age'] < 57)].index) + + idx_to_change_properties = list(text_data.properties[(text_data.properties['Formality'] > 0.4) & ( + text_data.properties['Text Length'] > 80) & (text_data.properties['Text Length'] < 130)].index) + + label = text_data._label.astype(object) # pylint: disable=protected-access + label[idx_to_change_metadata[int(len(idx_to_change_metadata) / 2):]] = None + label[idx_to_change_properties[:int(len(idx_to_change_properties) / 2)]] = None + + text_data._label = label # pylint: disable=protected-access + + return text_data diff --git a/docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py b/docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py new file mode 100644 index 0000000000..79627d3df6 --- /dev/null +++ b/docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +""" + +.. _nlp__under_annotated_metadata_segments: + +Under Annotated Metadata Segments +********************************* + +This notebook provides an overview for using and understanding the under annotated metadata segments check. + +**Structure:** + +* `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__ +* `Automatically detecting under annotated segments <#automatically-detecting-under-annotated-segments>`__ +* `Generate data & model <#generate-data-model>`__ +* `Run the check <#run-the-check>`__ +* `Define a condition <#define-a-condition>`__ + +What is the purpose of the check? +================================== + +The Under-Annotated Metadata Segments check is designed to help you easily identify segments in your data which are +under-annotated compared to the rest of your dataset, based on the provided :ref:`metadata `. 
+The check could be very useful for example for identifying a specific data source for which there was +less labeled data. The check can be guided to run only on a specific list of metadata columns, +enabling you to focus on columns where you know a problem exists, or on important business segments. + +Automatically detecting under annotated segments +================================================ + +The check contains two main steps: + +#. We train multiple simple tree based models, each one is trained using exactly two + metadata columns (out of the ones selected above) to predict whether a sample will have a label. + +#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate + the annotation ratio in the samples within in. We keep the segments with the lowest annotation ratio. +""" +#%% +# Generate data & model +# ===================== + +from deepchecks.nlp.utils.test_utils import load_modified_tweet_text_data + +text_data = load_modified_tweet_text_data() +text_data.metadata.head(3) + +#%% +# Run the check +# ============= +# +# The check has several key parameters (that are all optional) that affect the behavior of the +# check and especially its output. +# +# ``columns / ignore_columns``: Controls which columns should be searched for under annotated segments. By default, +# uses all columns. +# +# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will +# return data segments that contain at least this fraction of the total data samples. It is recommended to +# try different configurations +# of this parameter as larger segments can be of interest even the model performance on them is superior. +# +# ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called +# "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category. +# +# see :class:`API reference ` for more details. + +from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments + +check = UnderAnnotatedMetaDataSegments(segment_minimum_size_ratio=0.07) +result = check.run(text_data) +result.show() + +#%% +# Observe the check's output +# -------------------------- +# +# We see in the results that the check indeed found several under annotated segments. +# In the scatter plot display we can see the under annotated segment as well as the annotation distribution with +# respect to the two metadata columns that are relevant to the segment. In order to get the full list +# of under annotated segments found we will inspect +# the ``result.value`` attribute. Shown below are the 3 segments with the worst performance. + + +result.value['weak_segments_list'].head(3) + +#%% +# Define a condition +# ================== +# +# We can add a condition that will validate the annotation ratio in all data segment is +# above a certain threshold. +# A scenario where this can be useful is when we want to make sure that we have enough annotations for quality +# evaluation of the model or drift on a subset of the data that is of interest to us, +# for example for specific age or gender groups. 
+ +# Let's add a condition and re-run the check: + +check = UnderAnnotatedMetaDataSegments(segment_minimum_size_ratio=0.07) +check.add_condition_segments_annotation_ratio_greater_than(0.7) +result = check.run(text_data) +result.show(show_additional_outputs=False) diff --git a/docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py b/docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py new file mode 100644 index 0000000000..14d80f9cb9 --- /dev/null +++ b/docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +""" + +.. _nlp__under_annotated_property_segments: + +Under Annotated Property Segments +********************************* + +This notebook provides an overview for using and understanding the under annotated property segments check. + +**Structure:** + +* `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__ +* `Automatically detecting under annotated segments <#automatically-detecting-under-annotated-segments>`__ +* `Generate data & model <#generate-data-model>`__ +* `Run the check <#run-the-check>`__ +* `Define a condition <#define-a-condition>`__ + +What is the purpose of the check? +================================== + +The Under-Annotated Property Segments check is designed to help you easily identify segments in your data which are +under-annotated compared to the rest of your dataset, based on the provided :ref:`properties `. +The check could be very useful in identifying a specific data samples (for example less fluent or less formal samples) +for which there was a problem in the annotation process. +The check can be guided to run only on a specific list of properties, +enabling you to focus on properties where you know an issue exists, or on important business segments. + +Automatically detecting under annotated segments +================================================ + +The check contains two main steps: + +#. We train multiple simple tree based models, each one is trained using exactly two + properties (out of the ones selected above) to predict whether a sample will have a label. + +#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate + the annotation ratio in the samples within in. We keep the segments with the lowest annotation ratio. +""" +#%% +# Generate data & model +# ===================== + +from deepchecks.nlp.utils.test_utils import load_modified_tweet_text_data + +text_data = load_modified_tweet_text_data() +text_data.properties.head(3) + +#%% +# Run the check +# ============= +# +# The check has several key parameters (that are all optional) that affect the behavior of the +# check and especially its output. +# +# ``properties / ignore_properties``: Controls which properties should be searched for under +# annotated segments. By default, uses all properties. +# +# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will +# return data segments that contain at least this fraction of the total data samples. It is recommended to +# try different configurations +# of this parameter as larger segments can be of interest even the model performance on them is superior. +# +# ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called +# "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category. 
+# +# see :class:`API reference ` for more details. + +from deepchecks.nlp.checks import UnderAnnotatedPropertySegments + +check = UnderAnnotatedPropertySegments(segment_minimum_size_ratio=0.04) +result = check.run(text_data) +result.show() + +#%% +# Observe the check's output +# -------------------------- +# +# We see in the results that the check indeed found several under annotated segments. +# In the scatter plot display we can see the under annotated segment as well as the annotation distribution with +# respect to the two properties that are relevant to the segment. In order to get the full list +# of under annotated segments found we will inspect +# the ``result.value`` attribute. Shown below are the 3 segments with the worst performance. + + +result.value['weak_segments_list'].head(3) + +#%% +# Define a condition +# ================== +# +# We can add a condition that will validate the annotation ratio in all data segment is +# above a certain threshold. +# A scenario where this can be useful is when we want to make sure that we have enough annotations for quality +# evaluation of the model or drift on a subset of the data that is of interest to us, +# for example for specific age or gender groups. + +# Let's add a condition and re-run the check: + +check = UnderAnnotatedPropertySegments(segment_minimum_size_ratio=0.04) +check.add_condition_segments_annotation_ratio_greater_than(0.7) +result = check.run(text_data) +result.show(show_additional_outputs=False) diff --git a/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py b/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py index 7b1d30c939..69fff89220 100644 --- a/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py +++ b/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py @@ -20,7 +20,7 @@ ================================== The check is designed to help you easily identify the model's weakest segments based on the provided -:func:`metadata `. In addition, +:ref:`metadata `. In addition, it enables to provide a sublist of the metadata columns, thus limiting the check to search in interesting subspaces. @@ -32,11 +32,8 @@ #. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE according to the task type. -#. Select a subset of features for the weak segment search. This is done by selecting the features with the - highest feature importance to the model provided (within the features selected for check, if limited). - #. We train multiple simple tree based models, each one is trained using exactly two - features (out of the ones selected above) to predict the per sample error calculated before. + metadata columns (out of the ones selected above) to predict the per sample error calculated before. #. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate the model performance on them. For the weakest data segments detected we also calculate the model's diff --git a/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py b/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py index d1c179ecbf..3a8ad17424 100644 --- a/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py +++ b/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- """ +.. 
_nlp__property_segments_performance: + + Property Segments Performance ***************************** @@ -17,7 +20,7 @@ ================================= The check is designed to help you easily identify the model's weakest segments based on the provided -:func:`properties `. In addition, +:ref:`properties `. In addition, it enables to provide a sublist of the metadata columns, thus limiting the check to search in interesting subspaces. @@ -29,11 +32,8 @@ #. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE according to the task type. -#. Select a subset of features for the weak segment search. This is done by selecting the features with the - highest feature importance to the model provided (within the features selected for check, if limited). - #. We train multiple simple tree based models, each one is trained using exactly two - features (out of the ones selected above) to predict the per sample error calculated before. + properties (out of the ones selected above) to predict the per sample error calculated before. #. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate the model performance on them. For the weakest data segments detected we also calculate the model's From 3ebf01ccfecd4cd6021904c5f3c1d3e755ee26e2 Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Thu, 11 May 2023 15:25:24 +0530 Subject: [PATCH 04/20] Fixes #2454 Added Readability score and average sentence length text property (#2512) * Added Flesch reading ease and average sentence length text property --------- Co-authored-by: Harsh Jain Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> --- deepchecks/nlp/utils/text_properties.py | 75 +++++++++++++++++-- .../nlp/usage_guides/nlp_properties.rst | 2 + .../under_annotated_segments_test.py | 6 +- .../property_drift_test.py | 5 +- tests/nlp/test_text_data.py | 7 +- tests/nlp/utils/test_properties.py | 30 ++++++++ 6 files changed, 111 insertions(+), 14 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 833d1ab1a3..345c36c61c 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -19,8 +19,11 @@ import pandas as pd import requests import textblob +from nltk import corpus from nltk import download as nltk_download +from nltk import sent_tokenize, word_tokenize +from deepchecks.nlp.utils.text import remove_punctuation from deepchecks.utils.function import run_available_kwargs __all__ = ['calculate_default_properties'] @@ -304,7 +307,7 @@ def lexical_density(raw_text: Sequence[str]) -> List[str]: """ if not nltk_download('punkt', quiet=True): warnings.warn('nltk punkt not found, lexical density cannot be calculated.' - ' Please check your internet connection.') + ' Please check your internet connection.', UserWarning) return [np.nan] * len(raw_text) result = [] for text in raw_text: @@ -323,7 +326,7 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]: """Return a list of integers of number of unique noun words in the text.""" if not nltk_download('averaged_perceptron_tagger', quiet=True): warnings.warn('nltk averaged_perceptron_tagger not found, unique noun count cannot be calculated.' 
- ' Please check your internet connection.') + ' Please check your internet connection.', UserWarning) return [np.nan] * len(raw_text) result = [] for text in raw_text: @@ -335,6 +338,63 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]: return result +def readability_score(raw_text: Sequence[str]) -> List[str]: + """Return a list of floats of Flesch Reading-Ease score per text sample. + + In the Flesch reading-ease test, higher scores indicate material that is easier to read + whereas lower numbers mark texts that are more difficult to read. For more information: + https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease + """ + if not nltk_download('punkt', quiet=True): + warnings.warn('nltk punkt not found, readability score cannot be calculated.' + ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + if not nltk_download('cmudict', quiet=True): + warnings.warn('nltk cmudict not found, readability score cannot be calculated.' + ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + result = [] + cmudict_dict = corpus.cmudict.dict() + for text in raw_text: + if not pd.isna(text): + sentence_count = len(sent_tokenize(text)) + text = remove_punctuation(text) + words = word_tokenize(text) + word_count = len(words) + syllable_count = sum([len(cmudict_dict[word.lower()]) for word in words if word.lower() in cmudict_dict]) + if word_count != 0 and sentence_count != 0 and syllable_count != 0: + avg_syllables_per_word = syllable_count / word_count + avg_words_per_sentence = word_count / sentence_count + flesch_reading_ease = 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * avg_syllables_per_word) + result.append(round(flesch_reading_ease, 3)) + else: + result.append(np.nan) + else: + result.append(np.nan) + return result + + +def average_sentence_length(raw_text: Sequence[str]) -> List[str]: + """Return a list of floats denoting the average sentence length per text sample.""" + if not nltk_download('punkt', quiet=True): + warnings.warn('nltk punkt not found, average sentence length cannot be calculated.' + ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + result = [] + for text in raw_text: + if not pd.isna(text): + sentences = [remove_punctuation(sent) for sent in sent_tokenize(text)] + total_words = sum([len(word_tokenize(sentence)) for sentence in sentences]) + if len(sentences) != 0: + asl = total_words / len(sentences) + result.append(round(asl, 0)) + else: + result.append(np.nan) + else: + result.append(np.nan) + return result + + DEFAULT_PROPERTIES = ( {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'}, {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'}, @@ -348,6 +408,8 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]: {'name': 'Formality', 'method': formality, 'output_type': 'numeric'}, {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'}, {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'}, + {'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'}, + {'name': 'Average Sentence Length', 'method': average_sentence_length, 'output_type': 'numeric'}, ) LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Unique Noun Count'] @@ -395,10 +457,11 @@ def calculate_default_properties( The properties to calculate. 
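A minimal usage sketch of the two properties added above, called through ``calculate_default_properties``. The return structure (a tuple whose first element maps property names to value lists) follows the tests added later in this patch, and the nltk ``punkt``/``cmudict`` corpora are downloaded on first use:

from deepchecks.nlp.utils.text_properties import calculate_default_properties

texts = [
    'Deepchecks validates NLP data. It also checks model predictions.',
    'Short text.',
]
values = calculate_default_properties(
    texts, include_properties=['Readability Score', 'Average Sentence Length'])[0]
print(values['Readability Score'])        # one Flesch reading-ease score per sample
print(values['Average Sentence Length'])  # rounded average words per sentence, per sample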
If None, all default properties will be calculated. Cannot be used together with ignore_properties parameter. Available properties are: ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language', - 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count'] - Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language'] may take a long time to calculate. If - include_long_calculation_properties is False, these properties will be ignored, even if they are in the - include_properties parameter. + 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count', + 'Readability Score', 'Average Sentence Length'] + Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may + take a long time to calculate. If include_long_calculation_properties is False, these properties will be + ignored, even if they are in the include_properties parameter. ignore_properties : List[str], default None The properties to ignore. If None, no properties will be ignored. Cannot be used together with properties parameter. diff --git a/docs/source/nlp/usage_guides/nlp_properties.rst b/docs/source/nlp/usage_guides/nlp_properties.rst index b200a8a011..f1afa8c760 100644 --- a/docs/source/nlp/usage_guides/nlp_properties.rst +++ b/docs/source/nlp/usage_guides/nlp_properties.rst @@ -59,6 +59,8 @@ Fluency* Fluency of the text. Uses the prithivida/parrot_ Formality* Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model Lexical Density Percentage of unique words in the text, rounded up to 2 decimal digits Unique Noun Count* Number of unique noun words in the text +Readability Score A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease +Average Sentence Length Average number of words per sentence in the text ============================== ========== *These properties are not calculated by default, as they may take a long time to calculate. 
To use them, pass diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index b2c803f799..3c88b13f91 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -113,15 +113,15 @@ def test_token_classification_dataset(small_wikiann_train_test_text_data): # Assert assert_that(condition_result, has_items( equal_condition_result(is_pass=False, - details='Found a segment with annotation ratio of 0.375 in comparison to an ' + details='Found a segment with annotation ratio of 0.2 in comparison to an ' 'average score of 0.8 in sampled data.', name='The relative performance of weakest segment is greater than 80% of average model ' 'performance.') )) assert_that(result.value['avg_score'], close_to(0.8, 0.001)) - assert_that(len(result.value['weak_segments_list']), equal_to(15)) - assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.375, 0.01)) + assert_that(len(result.value['weak_segments_list']), equal_to(25)) + assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.2, 0.01)) def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py index 952582d041..0de5e5589e 100644 --- a/tests/nlp/checks/train_test_validation/property_drift_test.py +++ b/tests/nlp/checks/train_test_validation/property_drift_test.py @@ -171,8 +171,9 @@ def test_without_drift(self, dummy_multilabel_textdata_train_test): def test_with_drift(self, dummy_multilabel_textdata_train_test): # Arrange train, test = dummy_multilabel_textdata_train_test - train.calculate_default_properties(ignore_properties=['Lexical Density','Unique Noun Count']) - test.calculate_default_properties(ignore_properties=['Lexical Density','Unique Noun Count']) + properties_to_ignore = ['Lexical Density','Unique Noun Count', 'Average Sentence Length', 'Readability Score'] + train.calculate_default_properties(ignore_properties=properties_to_ignore) + test.calculate_default_properties(ignore_properties=properties_to_ignore) check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3, max_allowed_categorical_score=0.3) # Act diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index 3eae2cd298..4ecfd1e0ef 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -157,11 +157,12 @@ def test_properties(text_classification_dataset_mock): dataset.calculate_default_properties(ignore_properties=['topic'] + LONG_RUN_PROPERTIES) properties = dataset.properties assert_that(properties.shape[0], equal_to(3)) - assert_that(properties.shape[1], equal_to(7)) + assert_that(properties.shape[1], equal_to(9)) assert_that(properties.columns, contains_exactly('Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', - 'Sentiment', 'Subjectivity', 'Lexical Density')) - assert_that(properties.iloc[0].values, contains_exactly(22, 3.6, 9, 0.0, 0.0, 0.0, 80.0 )) + 'Sentiment', 'Subjectivity', 'Lexical Density', 'Readability Score', + 'Average Sentence Length')) + assert_that(properties.iloc[0].values, contains_exactly(22, 3.6, 9, 0.0, 0.0, 0.0, 80.0, 100.24, 5)) def test_embeddings(): diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index 0115ad81ac..21099cde53 100644 --- 
a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -76,6 +76,36 @@ def test_calculate_unique_noun_count_property(tweet_emotion_train_test_textdata) assert_that(result_none_text['Unique Noun Count'], equal_to([np.nan])) +def test_calculate_average_sentence_length_property(tweet_emotion_train_test_textdata): + + # Arrange + _, test = tweet_emotion_train_test_textdata + test_text = test.text + + # Act + result = calculate_default_properties(test_text, include_properties=['Average Sentence Length'])[0] + result_none_text = calculate_default_properties([None], include_properties=['Average Sentence Length'])[0] + + # Assert + assert_that(result['Average Sentence Length'][0: 10], equal_to([6, 7, 11, 12, 8, 19, 3, 9, 12, 7])) + assert_that(result_none_text['Average Sentence Length'], equal_to([np.nan])) + + +def test_calculate_readability_score_property(tweet_emotion_train_test_textdata): + + # Arrange + _, test = tweet_emotion_train_test_textdata + test_text = test.text + + # Act + result = calculate_default_properties(test_text, include_properties=['Readability Score'])[0] + result_none_text = calculate_default_properties([None], include_properties=['Readability Score'])[0] + + # Assert + assert_that(result['Readability Score'][0: 10], equal_to([102.045, 97.001, 80.306, 67.755, 77.103, 71.782, 90.99, 75.5, 70.102, 95.564])) + assert_that(result_none_text['Readability Score'], equal_to([np.nan])) + + @pytest.mark.skipif( 'TEST_NLP_PROPERTIES_MODELS_DOWNLOAD' not in os.environ, reason='The test takes too long to run, provide env var if you want to run it.' From 63606deeeca86302a8d956734efc6b51af4ec4f1 Mon Sep 17 00:00:00 2001 From: JKL98ISR Date: Thu, 11 May 2023 14:29:06 +0300 Subject: [PATCH 05/20] fix version in docs (#2522) fix_version_docs --- docs/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e935c890af..2652ea3ee7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -49,9 +49,9 @@ version = 'dev' else: # Taking the major and minor version from the branch name - version_match = re.search(r'\d+(?:\.\d+)', os.environ.get("GITHUB_REF_NAME")) + version_match: re.Match = re.match(r'\d+(?:\.\d+)', os.environ.get("GITHUB_REF_NAME")) if version_match is not None: - version = version_match.string + version = version_match.group(0) version = version or VERSION language = os.environ.get("READTHEDOCS_LANGUAGE") From f205e23ffaa14e4a09862d7d2c1bea2241972543 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Thu, 11 May 2023 19:37:00 +0300 Subject: [PATCH 06/20] change nlp to beta (#2527) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4acd5de43b..fd8bf5b1d6 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ data integrity, distribution mismatches, and more. **This README refers to the Tabular version** of deepchecks. - Check out the [Deepchecks for Computer Vision & Images subpackage](deepchecks/vision) for more details about deepchecks for CV, currently in *beta release*. -- Check out the [Deepchecks for NLP subpackage](deepchecks/nlp) for more details about deepchecks for NLP, currently in *alpha release*. +- Check out the [Deepchecks for NLP subpackage](deepchecks/nlp) for more details about deepchecks for NLP, currently in *beta release*. 
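The ``conf.py`` fix above replaces ``re.search(...).string`` with ``re.match(...).group(0)``. A small stdlib illustration of why, using a hypothetical release-branch name:

import re

ref_name = '1.23.x'                                  # hypothetical GITHUB_REF_NAME value
print(re.search(r'\d+(?:\.\d+)', ref_name).string)   # '1.23.x' - the whole input, not the match
print(re.match(r'\d+(?:\.\d+)', ref_name).group(0))  # '1.23'   - the intended major.minor version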
## 💻 Installation From d6bab6affba64d53b565205816010fe64b6872b4 Mon Sep 17 00:00:00 2001 From: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> Date: Sun, 14 May 2023 14:01:52 +0300 Subject: [PATCH 07/20] Changed text embeddings to numpy array --- .../datasets/classification/tweet_emotion.py | 35 ++++++++++++------ deepchecks/nlp/input_validations.py | 26 +++++++++---- deepchecks/nlp/text_data.py | 37 +++++++++++-------- .../multivariate_embeddings_drift_utils.py | 32 ++++++++-------- deepchecks/nlp/utils/text_embeddings.py | 9 ++--- .../nlp/usage_guides/nlp_embeddings.rst | 4 +- .../text_embeddings_drift_test.py | 14 +++---- tests/nlp/test_datasets.py | 7 ++-- 8 files changed, 95 insertions(+), 69 deletions(-) diff --git a/deepchecks/nlp/datasets/classification/tweet_emotion.py b/deepchecks/nlp/datasets/classification/tweet_emotion.py index aa35ac9be9..2ccecbc838 100644 --- a/deepchecks/nlp/datasets/classification/tweet_emotion.py +++ b/deepchecks/nlp/datasets/classification/tweet_emotion.py @@ -20,16 +20,18 @@ import os import pathlib import typing as t +from io import BytesIO import numpy as np import pandas as pd +import requests from deepchecks.nlp import TextData __all__ = ['load_data', 'load_embeddings', 'load_precalculated_predictions'] _FULL_DATA_URL = 'https://ndownloader.figshare.com/files/39486889' -_EMBEDDINGS_URL = 'https://ndownloader.figshare.com/files/39729283' +_EMBEDDINGS_URL = 'https://ndownloader.figshare.com/files/40564880' _PROPERTIES_URL = 'https://ndownloader.figshare.com/files/39717619' _PREDICTIONS_URL = 'https://ndownloader.figshare.com/files/39264461' @@ -41,7 +43,7 @@ _CAT_PROPERTIES = ['Language'] -def load_embeddings(as_train_test: bool = True) -> t.Union[pd.DataFrame, t.Tuple[pd.DataFrame, pd.DataFrame]]: +def load_embeddings(as_train_test: bool = True) -> t.Union[np.array, t.Tuple[np.array, np.array]]: """Load and return the embeddings of the tweet_emotion dataset calculated by OpenAI. Parameters @@ -56,11 +58,11 @@ def load_embeddings(as_train_test: bool = True) -> t.Union[pd.DataFrame, t.Tuple embeddings : np.ndarray Embeddings for the tweet_emotion dataset. """ - all_embeddings = _read_and_save('tweet_emotion_embeddings.csv', _EMBEDDINGS_URL, to_numpy=False).drop( - columns=['train_test_split']) + all_embeddings = _read_and_save('tweet_emotion_embeddings.npy', _EMBEDDINGS_URL, file_type='npy') + if as_train_test: train_indexes, test_indexes = _get_train_test_indexes() - return all_embeddings.loc[train_indexes], all_embeddings.loc[test_indexes] + return all_embeddings[train_indexes], all_embeddings[test_indexes] else: return all_embeddings @@ -126,7 +128,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, if data_format.lower() not in ['textdata', 'dataframe']: raise ValueError('data_format must be either "Dataset" or "Dataframe"') - data = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL, to_numpy=False) + data = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL) if not as_train_test: data.drop(columns=['train_test_split'], inplace=True) if data_format.lower() != 'textdata': @@ -183,7 +185,7 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te The prediction of the data elements in the dataset. 
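A minimal usage sketch of the loaders changed above; the first call downloads the assets, and the embeddings are now returned as plain numpy arrays:

import numpy as np
from deepchecks.nlp.datasets.classification import tweet_emotion

train_embeddings, test_embeddings = tweet_emotion.load_embeddings(as_train_test=True)
print(train_embeddings.shape)   # (2675, 1536) per the dataset tests updated later in this patch

predictions = tweet_emotion.load_precalculated_predictions(
    pred_format='predictions', as_train_test=False)
print(np.unique(predictions))   # the distinct predicted label names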
""" - all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL) + all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL, to_numpy=True) if pred_format == 'predictions': all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)]) elif pred_format != 'probabilities': @@ -196,14 +198,25 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te return all_preds -def _read_and_save(file_name, url_to_file, to_numpy=True): +def _read_and_save(file_name, url_to_file, file_type='csv', to_numpy=False): """Read a file from a url and save it to the assets' directory.""" os.makedirs(ASSETS_DIR, exist_ok=True) if (ASSETS_DIR / file_name).exists(): - data = pd.read_csv(ASSETS_DIR / file_name, index_col=0) + if file_type == 'csv': + data = pd.read_csv(ASSETS_DIR / file_name, index_col=0) + elif file_type == 'npy': + data = np.load(ASSETS_DIR / file_name) + else: + raise ValueError('file_type must be either "csv" or "npy"') else: - data = pd.read_csv(url_to_file, index_col=0) - data.to_csv(ASSETS_DIR / file_name) + if file_type == 'csv': + data = pd.read_csv(url_to_file, index_col=0) + data.to_csv(ASSETS_DIR / file_name) + elif file_type == 'npy': + data = np.load(BytesIO(requests.get(url_to_file).content)) + np.save(ASSETS_DIR / file_name, data) + else: + raise ValueError('file_type must be either "csv" or "npy"') if to_numpy: data = data.to_numpy() diff --git a/deepchecks/nlp/input_validations.py b/deepchecks/nlp/input_validations.py index e79a2b521c..01f3df9cf3 100644 --- a/deepchecks/nlp/input_validations.py +++ b/deepchecks/nlp/input_validations.py @@ -101,17 +101,17 @@ class ColumnTypes(NamedTuple): numerical_columns: List[str] -def validate_length_and_type(data_table: pd.DataFrame, data_table_name: str, expected_size: int): - """Validate length of data table and type.""" - if not isinstance(data_table, pd.DataFrame): +def validate_length_and_type_numpy_array(data: np.ndarray, data_name: str, expected_size: int): + """Validate length of numpy array and type.""" + if not isinstance(data, np.ndarray): raise DeepchecksValueError( - f'{data_table_name} type {type(data_table)} is not supported, ' - 'must be a pandas DataFrame' + f'{data_name} type {type(data)} is not supported, ' + 'must be a numpy array' ) - if len(data_table) != expected_size: + if len(data) != expected_size: raise DeepchecksValueError( - f'received {data_table_name} with {len(data_table)} rows, ' + f'received {data_name} with {len(data)} rows, ' f'expected {expected_size}' ) @@ -123,7 +123,17 @@ def validate_length_and_calculate_column_types( categorical_columns: Optional[Sequence[str]] = None ) -> ColumnTypes: """Validate length of data table and calculate column types.""" - validate_length_and_type(data_table, data_table_name, expected_size) + if not isinstance(data_table, pd.DataFrame): + raise DeepchecksValueError( + f'{data_table_name} type {type(data_table)} is not supported, ' + 'must be a pandas DataFrame' + ) + + if len(data_table) != expected_size: + raise DeepchecksValueError( + f'received {data_table_name} with {len(data_table)} rows, ' + f'expected {expected_size}' + ) if categorical_columns is None: # TODO: Add tests categorical_features = infer_categorical_features(data_table) diff --git a/deepchecks/nlp/text_data.py b/deepchecks/nlp/text_data.py index 069c55cafe..7ab1112fb0 100644 --- a/deepchecks/nlp/text_data.py +++ b/deepchecks/nlp/text_data.py @@ -18,8 +18,9 @@ import pandas as pd from deepchecks.core.errors import 
DeepchecksNotSupportedError, DeepchecksValueError -from deepchecks.nlp.input_validations import (validate_length_and_calculate_column_types, validate_length_and_type, - validate_modify_label, validate_raw_text, validate_tokenized_text) +from deepchecks.nlp.input_validations import (validate_length_and_calculate_column_types, + validate_length_and_type_numpy_array, validate_modify_label, + validate_raw_text, validate_tokenized_text) from deepchecks.nlp.task_type import TaskType, TTextLabel from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings from deepchecks.nlp.utils.text_properties import calculate_default_properties @@ -91,13 +92,14 @@ class TextData: categorical_properties : t.Optional[t.List[str]] , default: None The names of the categorical properties columns. If None, categorical properties columns are automatically inferred. Only relevant if properties is not None. - embeddings : t.Optional[Union[pd.DataFrame, np.ndarray, str]] , default: None - The text embeddings for the samples. Embeddings must be given as either a pandas DataFrame or a path to a pandas - DataFrame compatible csv file, with the rows representing each sample and columns representing the different - embeddings dimensions. If None, no embeddings are set. - The number of rows in the embeddings DataFrame must be equal to the number of samples in the dataset, and the - order of the rows must be the same as the order of the samples in the dataset. - In order to calculate the default embeddings, use the `TextData.calculate_default_embeddings` function after + embeddings : t.Optional[Union[np.ndarray, pd.DataFrame, str]], default: None + The text embeddings for the samples. Embeddings must be given as a numpy array (or a path to an .npy + file containing a numpy array) of shape (N, E), where N is the number of samples in the TextData object and E + is the number of embeddings dimensions. + The numpy array must be in the same order as the samples in the TextData. + If None, no embeddings are set. + + In order to use the default embeddings, use the `TextData.calculate_default_embeddings` function after the creation of the TextData object. For more on embeddings, see the :ref:`Text Embeddings Guide ` """ @@ -238,7 +240,7 @@ def copy(self: TDataset, rows_to_use: t.Optional[t.Sequence[int]] = None) -> TDa new_copy.set_properties(properties, self._cat_properties) if self._embeddings is not None: - embeddings = self._embeddings.iloc[rows_to_use, :] + embeddings = self._embeddings[rows_to_use] new_copy.set_embeddings(embeddings) new_copy._original_text_index = self._original_text_index[rows_to_use] @@ -305,11 +307,11 @@ def calculate_default_embeddings(self, model: str = 'miniLM', file_path: str = ' The path to save the embeddings to. """ if self._embeddings is not None: - warnings.warn('Properties already exist, overwriting them', UserWarning) + warnings.warn('Embeddings already exist, overwriting them', UserWarning) self._embeddings = calculate_default_embeddings(text=self.text, model=model, file_path=file_path) - def set_embeddings(self, embeddings: pd.DataFrame, verbose: bool = True): + def set_embeddings(self, embeddings: np.ndarray, verbose: bool = True): """Set the metadata of the dataset. 
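A minimal sketch of attaching pre-computed embeddings in the new numpy format; the texts and the random (N, E) array below are placeholders:

import numpy as np
from deepchecks.nlp import TextData

texts = ['first sample', 'second sample', 'third sample']
embeddings = np.random.rand(len(texts), 8)   # stand-in for real (N, E) embeddings

dataset = TextData(texts, task_type='text_classification')
dataset.set_embeddings(embeddings)           # a path to an .npy file is also accepted
print(dataset.embeddings.shape)              # (3, 8)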
Parameters @@ -322,12 +324,15 @@ def set_embeddings(self, embeddings: pd.DataFrame, verbose: bool = True): if self._embeddings is not None and verbose is True: warnings.warn('Embeddings already exist, overwriting it', UserWarning) - if isinstance(embeddings, np.ndarray): - embeddings = pd.DataFrame(embeddings) + if isinstance(embeddings, pd.DataFrame): + embeddings = embeddings.to_numpy() + + if isinstance(embeddings, str): + embeddings = np.load(embeddings) if embeddings is not None: - validate_length_and_type(embeddings, 'Embeddings', len(self)) - self._embeddings = embeddings.reset_index(drop=True) if isinstance(embeddings, pd.DataFrame) else None + validate_length_and_type_numpy_array(embeddings, 'Embeddings', len(self)) + self._embeddings = embeddings @property def metadata(self) -> pd.DataFrame: diff --git a/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py b/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py index 4e6dedae18..b6fd009cbd 100644 --- a/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py +++ b/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py @@ -21,7 +21,6 @@ from deepchecks.core.check_utils.multivariate_drift_utils import auc_to_drift_score, build_drift_plot from deepchecks.nlp import TextData from deepchecks.nlp.utils.nlp_plot import two_datasets_scatter_plot -from deepchecks.utils.dataframes import floatify_dataframe # Max number of samples to use for dimensionality reduction fit (to make calculation faster): SAMPLES_FOR_REDUCTION_FIT = 1000 @@ -32,21 +31,23 @@ def run_multivariable_drift_for_embeddings(train_dataset: TextData, test_dataset num_samples_in_display: int, dimension_reduction_method: str, with_display: bool): """Calculate multivariable drift on embeddings.""" + np.random.seed(random_state) + # sample train and test datasets equally train_sample = train_dataset.sample(sample_size, random_state=random_state) test_sample = test_dataset.sample(sample_size, random_state=random_state) - train_sample_df = train_sample.embeddings - test_sample_df = test_sample.embeddings + train_sample_embeddings = train_sample.embeddings + test_sample_embeddings = test_sample.embeddings # create new dataset, with label denoting whether sample belongs to test dataset - domain_class_df = pd.concat([train_sample_df, test_sample_df]).reset_index(drop=True) - domain_class_labels = pd.Series([0] * len(train_sample_df) + [1] * len(test_sample_df)) + domain_class_array = np.concatenate([train_sample_embeddings, test_sample_embeddings]) + domain_class_labels = pd.Series([0] * len(train_sample_embeddings) + [1] * len(test_sample_embeddings)) # reduce dimensionality of embeddings if needed. # skips if not required ('none') or if number of features is small enough (< 30) in 'auto' mode. use_reduction = not (dimension_reduction_method == 'none' or ( - dimension_reduction_method == 'auto' and domain_class_df.shape[1] < 30)) + dimension_reduction_method == 'auto' and domain_class_array.shape[1] < 30)) use_umap = dimension_reduction_method == 'umap' or (dimension_reduction_method == 'auto' and with_display) if use_reduction: @@ -56,17 +57,18 @@ def run_multivariable_drift_for_embeddings(train_dataset: TextData, test_dataset else: # Faster, but graph will look bad. 
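The hunk here switches the reducer-fitting step to sample rows of a numpy array directly. A standalone sketch of that fit-on-a-sample, transform-everything pattern, with PCA and random data standing in for the real embeddings:

import numpy as np
from sklearn.decomposition import PCA

embeddings = np.random.rand(5000, 384)                      # stand-in embedding matrix
sample_idx = np.random.choice(len(embeddings), 1000, replace=False)

reducer = PCA(n_components=10, random_state=42)
reducer.fit(embeddings[sample_idx])                         # fit only on the sampled rows
reduced = reducer.transform(embeddings)                     # transform every row
print(reduced.shape)                                        # (5000, 10)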
reducer = PCA(n_components=10, random_state=random_state) - samples_for_reducer = min(SAMPLES_FOR_REDUCTION_FIT, len(domain_class_df)) - reducer.fit(domain_class_df.sample(samples_for_reducer, random_state=random_state)) - domain_class_df = pd.DataFrame(reducer.transform(domain_class_df), index=domain_class_df.index) + samples_for_reducer = min(SAMPLES_FOR_REDUCTION_FIT, len(domain_class_array)) + samples = np.random.choice(len(domain_class_array), samples_for_reducer, replace=False) + reducer.fit(domain_class_array[samples]) + domain_class_array = reducer.transform(domain_class_array) # update train and test samples with new reduced embeddings (used later in display) - new_embeddings_train = domain_class_df.iloc[:len(train_sample_df)] - new_embeddings_test = domain_class_df.iloc[len(train_sample_df):] + new_embeddings_train = domain_class_array[:len(train_sample_embeddings)] + new_embeddings_test = domain_class_array[len(train_sample_embeddings):] train_sample.set_embeddings(new_embeddings_train, verbose=False) test_sample.set_embeddings(new_embeddings_test, verbose=False) - x_train, x_test, y_train, y_test = train_test_split(floatify_dataframe(domain_class_df), domain_class_labels, + x_train, x_test, y_train, y_test = train_test_split(domain_class_array, domain_class_labels, stratify=domain_class_labels, random_state=random_state, test_size=test_size) @@ -81,8 +83,8 @@ def run_multivariable_drift_for_embeddings(train_dataset: TextData, test_dataset values_dict = {'domain_classifier_auc': domain_classifier_auc, 'domain_classifier_drift_score': drift_score} if with_display: - relevant_index_train = list(x_test[y_test == 0].index) - relevant_index_test = [x - len(train_sample_df) for x in x_test[y_test == 1].index] + relevant_index_train = list(y_test[y_test == 0].index) + relevant_index_test = [x - len(train_sample_embeddings) for x in y_test[y_test == 1].index] train_sample = train_sample.copy(rows_to_use=relevant_index_train) test_sample = test_sample.copy(rows_to_use=relevant_index_test) @@ -106,7 +108,7 @@ def run_multivariable_drift_for_embeddings(train_dataset: TextData, test_dataset def display_embeddings(train_dataset: TextData, test_dataset: TextData, random_state: int): """Display the embeddings with the domain classifier proba as the x-axis and the embeddings as the y-axis.""" - embeddings = pd.concat([train_dataset.embeddings, test_dataset.embeddings]) + embeddings = np.concatenate([train_dataset.embeddings, test_dataset.embeddings]) reducer = UMAP(n_components=2, n_neighbors=5, init='random', min_dist=1, random_state=random_state) reduced_embeddings = reducer.fit_transform(embeddings) diff --git a/deepchecks/nlp/utils/text_embeddings.py b/deepchecks/nlp/utils/text_embeddings.py index 9c6ec0af1d..14d9599010 100644 --- a/deepchecks/nlp/utils/text_embeddings.py +++ b/deepchecks/nlp/utils/text_embeddings.py @@ -12,12 +12,11 @@ from typing import Optional import numpy as np -import pandas as pd from tqdm import tqdm def calculate_default_embeddings(text: np.array, model: str = 'miniLM', - file_path: Optional[str] = 'embeddings.csv') -> pd.DataFrame: + file_path: Optional[str] = 'embeddings.npy') -> np.array: """ Get default embeddings for the dataset. @@ -34,7 +33,7 @@ def calculate_default_embeddings(text: np.array, model: str = 'miniLM', Returns ------- - pd.DataFrame + np.array The embeddings for the dataset. 
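The hunk below changes ``calculate_default_embeddings`` to return a numpy array and persist it as an .npy file instead of a csv. A small round-trip sketch; the file name and array contents are placeholders:

import numpy as np

embeddings = np.random.rand(100, 384)   # placeholder embedding matrix
np.save('embeddings.npy', embeddings)   # what the function writes when file_path is given
loaded = np.load('embeddings.npy')      # and what TextData.set_embeddings can read back
assert loaded.shape == (100, 384)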
""" if model == 'miniLM': @@ -71,9 +70,9 @@ def _get_embedding_with_backoff(list_of_strings): embeddings.append(x['embedding']) else: raise ValueError(f'Unknown model type: {model}') - embeddings = pd.DataFrame(embeddings) + embeddings = np.array(embeddings) if file_path is not None: - embeddings.to_csv(file_path, index=False) + np.save(file_path, embeddings) return embeddings diff --git a/docs/source/nlp/usage_guides/nlp_embeddings.rst b/docs/source/nlp/usage_guides/nlp_embeddings.rst index c10dcd2e0c..78870eb68e 100644 --- a/docs/source/nlp/usage_guides/nlp_embeddings.rst +++ b/docs/source/nlp/usage_guides/nlp_embeddings.rst @@ -97,7 +97,7 @@ you can set the embeddings of the ``TextData`` object to use them by using one o #. After the initialization, call the ``set_embeddings`` method of the :class:`TextData ` object. -In both methods, you can pass the embeddings as a pandas DataFrame, or as a path to a csv file. For the correct format +In both methods, you can pass the embeddings as a numpy array, or as a path to an .npy file. For the correct format of the embeddings, see the :ref:`Pre-Calculated Embeddings Format` section. In the following example, we will pass pre-calculated embeddings to the ``TextData`` object in order to use the @@ -126,5 +126,3 @@ Pre-Calculated Embeddings Format The embeddings should be a numpy.ndarray of shape (N, E), where N is the number of samples in the :class:`TextData ` object and E is the number of embeddings dimensions. The numpy.ndarray must be in the same order as the samples in the TextData object. -Note that if you load the embeddings from a csv file, all columns (embedding dimensions) will be loaded and considered -as embeddings, so make sure not to include any other columns in the csv file such as the index column. 
\ No newline at end of file diff --git a/tests/nlp/checks/train_test_validation/text_embeddings_drift_test.py b/tests/nlp/checks/train_test_validation/text_embeddings_drift_test.py index 7cba1f81e1..5784d24341 100644 --- a/tests/nlp/checks/train_test_validation/text_embeddings_drift_test.py +++ b/tests/nlp/checks/train_test_validation/text_embeddings_drift_test.py @@ -46,21 +46,21 @@ def test_reduction_method(tweet_emotion_train_test_textdata_sampled): # Act result = check.run(train, test) - assert_that(result.value['domain_classifier_drift_score'], close_to(0.17, 0.01)) + assert_that(result.value['domain_classifier_drift_score'], close_to(0.11, 0.01)) # Make sure uses PCA with auto + with_display false: check = TextEmbeddingsDrift(dimension_reduction_method='auto') # Act result = check.run(train, test, with_display=False) - assert_that(result.value['domain_classifier_drift_score'], close_to(0.17, 0.01)) + assert_that(result.value['domain_classifier_drift_score'], close_to(0.11, 0.01)) # Make sure doesn't use embeddings if none: check = TextEmbeddingsDrift(dimension_reduction_method='none') # Act result = check.run(train, test) - assert_that(result.value['domain_classifier_drift_score'], close_to(0.14, 0.01)) + assert_that(result.value['domain_classifier_drift_score'], close_to(0.18, 0.01)) def test_max_drift_score_condition_pass(tweet_emotion_train_test_textdata_sampled): @@ -75,7 +75,7 @@ def test_max_drift_score_condition_pass(tweet_emotion_train_test_textdata_sample # Assert assert_that(condition_result, equal_condition_result( is_pass=True, - details='Found drift value of: 0.17, corresponding to a domain classifier AUC of: 0.58', + details='Found drift value of: 0.12, corresponding to a domain classifier AUC of: 0.56', name='Drift value is less than 0.25', )) @@ -83,7 +83,7 @@ def test_max_drift_score_condition_pass(tweet_emotion_train_test_textdata_sample def test_max_drift_score_condition_fail(tweet_emotion_train_test_textdata_sampled): # Arrange train, test = tweet_emotion_train_test_textdata_sampled - check = TextEmbeddingsDrift().add_condition_overall_drift_value_less_than(0.15) + check = TextEmbeddingsDrift().add_condition_overall_drift_value_less_than(0.1) # Act result = check.run(train, test, with_display=False) @@ -92,6 +92,6 @@ def test_max_drift_score_condition_fail(tweet_emotion_train_test_textdata_sample # Assert assert_that(condition_result, equal_condition_result( is_pass=False, - name='Drift value is less than 0.15', - details='Found drift value of: 0.17, corresponding to a domain classifier AUC of: 0.58' + name='Drift value is less than 0.1', + details='Found drift value of: 0.12, corresponding to a domain classifier AUC of: 0.56' )) diff --git a/tests/nlp/test_datasets.py b/tests/nlp/test_datasets.py index 9cdc99858c..03051576bb 100644 --- a/tests/nlp/test_datasets.py +++ b/tests/nlp/test_datasets.py @@ -44,7 +44,6 @@ def test_tweet_emotion(): assert_that(len(embeddings), equal_to(len(full))) assert_that(len(train_embeddings) + len(test_embeddings), equal_to(len(full))) assert_that(len(train_embeddings), equal_to(len(train))) - assert_that(embeddings.columns, contains_exactly(*[str(x) for x in range(1536)])) - assert_that(train_embeddings.columns, contains_exactly(*[str(x) for x in range(1536)])) - assert_that(test_embeddings.columns, contains_exactly(*[str(x) for x in range(1536)])) - + assert_that(embeddings.shape, contains_exactly(4653, 1536)) + assert_that(train_embeddings.shape, contains_exactly(2675, 1536)) + assert_that(test_embeddings.shape, 
contains_exactly(1978, 1536)) From 78124a9e0a0458442dc9c6b01943eef0e57fb0ac Mon Sep 17 00:00:00 2001 From: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> Date: Mon, 15 May 2023 10:32:39 +0300 Subject: [PATCH 08/20] Nb/feat/support multi label (#2531) --- .../data_integrity/conflicting_labels.py | 10 +- .../model_evaluation/prediction_drift.py | 8 ++ .../train_test_validation/label_drift.py | 10 +- deepchecks/nlp/context.py | 14 ++- .../nlp/datasets/classification/__init__.py | 4 +- .../just_dance_comment_analysis.py | 112 ++++++++++++++++++ .../datasets/classification/tweet_emotion.py | 46 ++----- deepchecks/nlp/utils/text_properties.py | 11 +- .../utils/abstracts/prediction_drift.py | 4 +- deepchecks/utils/builtin_datasets_utils.py | 45 +++++++ .../utils/distribution/preprocessing.py | 8 ++ .../data_integrity/conflicting_labels_test.py | 2 +- .../model_evaluation/prediction_drift_test.py | 22 +++- .../train_test_validation/label_drift_test.py | 24 +++- tests/nlp/conftest.py | 10 +- tests/nlp/test_datasets.py | 16 ++- 16 files changed, 284 insertions(+), 62 deletions(-) create mode 100644 deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py create mode 100644 deepchecks/utils/builtin_datasets_utils.py diff --git a/deepchecks/nlp/checks/data_integrity/conflicting_labels.py b/deepchecks/nlp/checks/data_integrity/conflicting_labels.py index 94c7a296eb..32ef5e257f 100644 --- a/deepchecks/nlp/checks/data_integrity/conflicting_labels.py +++ b/deepchecks/nlp/checks/data_integrity/conflicting_labels.py @@ -11,6 +11,7 @@ """Module contains Conflicting Labels check.""" import typing as t +import numpy as np import pandas as pd from deepchecks.core import CheckResult @@ -83,7 +84,8 @@ def _truncate_text(self, x: str) -> str: def run_logic(self, context: Context, dataset_kind) -> CheckResult: """Run check.""" - dataset = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=self.random_state) + dataset = context.get_data_by_kind(dataset_kind) + dataset = dataset.sample(self.n_samples, random_state=self.random_state, drop_na_label=True) dataset = t.cast(TextData, dataset) samples = dataset.text n_of_samples = len(samples) @@ -96,12 +98,14 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: **self._text_normalization_kwargs )) - if dataset.task_type is TaskType.TOKEN_CLASSIFICATION or dataset.is_multi_label_classification(): + if dataset.task_type is TaskType.TOKEN_CLASSIFICATION: labels = [tuple(t.cast(t.Sequence[t.Any], it)) for it in dataset.label] + elif dataset.is_multi_label_classification(): + labels = [tuple(np.where(row == 1)[0]) for row in dataset.label] elif dataset.task_type is TaskType.TEXT_CLASSIFICATION: labels = dataset.label else: - raise DeepchecksValueError(f'Unknow task type - {dataset.task_type}') + raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}') df = pd.DataFrame({ 'hash': samples_hashes, diff --git a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py index 4c797ea56e..92276d5ae5 100644 --- a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py +++ b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py @@ -19,6 +19,8 @@ __all__ = ['PredictionDrift'] +from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class + class PredictionDrift(PredictionDriftAbstract, TrainTestCheck): """ @@ -150,10 +152,16 @@ def run_logic(self, context: Context) -> CheckResult: # Flag for computing drift 
on the probabilities rather than the predicted labels proba_drift = ((len(context.model_classes) == 2) and (self.drift_mode == 'auto')) or \ (self.drift_mode == 'proba') + model_classes = context.model_classes if proba_drift: + if context.is_multi_label_task(): + raise DeepchecksValueError('Cannot use proba drift mode for multi-label tasks') train_prediction = np.array(model.predict_proba(train_dataset)) test_prediction = np.array(model.predict_proba(test_dataset)) + elif context.is_multi_label_task(): + train_prediction = convert_multi_label_to_multi_class(model.predict(train_dataset), model_classes) + test_prediction = convert_multi_label_to_multi_class(model.predict(test_dataset), model_classes) else: train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1)) test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1)) diff --git a/deepchecks/nlp/checks/train_test_validation/label_drift.py b/deepchecks/nlp/checks/train_test_validation/label_drift.py index 98ed014215..bb4693ba4f 100644 --- a/deepchecks/nlp/checks/train_test_validation/label_drift.py +++ b/deepchecks/nlp/checks/train_test_validation/label_drift.py @@ -14,6 +14,7 @@ from deepchecks.core import CheckResult from deepchecks.nlp import Context, TrainTestCheck from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract +from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class __all__ = ['LabelDrift'] @@ -113,5 +114,12 @@ def run_logic(self, context: Context) -> CheckResult: train_dataset = context.train.sample(self.n_samples, random_state=self.random_state) test_dataset = context.test.sample(self.n_samples, random_state=self.random_state) - return self._calculate_label_drift(train_dataset.label.flatten(), test_dataset.label.flatten(), 'Label', + if context.is_multi_label_task(): + train_labels = convert_multi_label_to_multi_class(train_dataset.label, context.model_classes).flatten() + test_labels = convert_multi_label_to_multi_class(test_dataset.label, context.model_classes).flatten() + else: + train_labels = train_dataset.label + test_labels = test_dataset.label + + return self._calculate_label_drift(train_labels, test_labels, 'Label', 'categorical', context.with_display, (train_dataset.name, test_dataset.name)) diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 09f954f863..4dfb33f835 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -29,7 +29,7 @@ from deepchecks.tabular.utils.task_type import TaskType as TabularTaskType from deepchecks.utils.docref import doclink from deepchecks.utils.logger import get_logger -from deepchecks.utils.typing import BasicModel +from deepchecks.utils.typing import ClassificationModel from deepchecks.utils.validation import is_sequence_not_str __all__ = [ @@ -54,7 +54,7 @@ TTextProba = t.Sequence[t.Sequence[float]] -class _DummyModel(BasicModel): +class _DummyModel(ClassificationModel): """Dummy model class used for inference with static predictions from the user. 
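The label and prediction drift changes above flatten a multi-label indicator matrix into per-class pseudo-predictions via ``convert_multi_label_to_multi_class`` (added later in this patch). A small illustration of what that conversion produces:

import numpy as np
from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class

labels = np.array([[1, 0, 1],
                   [0, 1, 1],
                   [1, 0, 0]])     # 3 samples, 3 binary label columns
classes = ['anger', 'joy', 'sadness']
print(convert_multi_label_to_multi_class(labels, classes).ravel())
# ['anger' 'anger' 'joy' 'sadness' 'sadness'] - each class repeated by its positive count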
Parameters @@ -398,11 +398,17 @@ def raise_if_token_classification_task(self, check=None): f'"{check_name}" is not supported for the "{task_type_name}" tasks' ) + def is_multi_label_task(self): + """Return whether the task is multi-label classification.""" + if self.task_type == TaskType.TEXT_CLASSIFICATION: + dataset = t.cast(TextData, self._train if self._train is not None else self._test) + return dataset.is_multi_label_classification() + return False + def raise_if_multi_label_task(self, check=None): """Raise an exception if it is a multi-label classification task.""" - dataset = t.cast(TextData, self._train if self._train is not None else self._test) check_name = type(check).__name__ if check else 'Check' - if dataset.is_multi_label_classification(): + if self.is_multi_label_task(): raise DeepchecksNotSupportedError( f'"{check_name}" is not supported for the multilable classification tasks' ) diff --git a/deepchecks/nlp/datasets/classification/__init__.py b/deepchecks/nlp/datasets/classification/__init__.py index e17614bc49..fe099cdb25 100644 --- a/deepchecks/nlp/datasets/classification/__init__.py +++ b/deepchecks/nlp/datasets/classification/__init__.py @@ -9,6 +9,6 @@ # ---------------------------------------------------------------------------- # """Module for working with pre-built classification datasets.""" -from . import tweet_emotion +from . import just_dance_comment_analysis, tweet_emotion -__all__ = ['tweet_emotion'] +__all__ = ['tweet_emotion', 'just_dance_comment_analysis'] diff --git a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py new file mode 100644 index 0000000000..c918556ea9 --- /dev/null +++ b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py @@ -0,0 +1,112 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see . +# ---------------------------------------------------------------------------- +# +"""Dataset containing comments and metadata information for multilabel predictions for different properties of comments. + +The data has 216193 comments make on the just dance YouTube videos. It has metadata information about the date the +comment was written and the number of "likes" it got. It also has +42 multilabel binary target label columns, +referring to the category classification of the comment. + +This dataset is a modification of Just Dance @ YouTube dataset curated by the COIMBRA university, +For additional details about the dataset, please refer to the original source: +https://www.kaggle.com/datasets/renatojmsantos/just-dance-on-youtube. +Dataset used under the following license: https://creativecommons.org/licenses/by/4.0/ + +Original publication: +R. Santos, J. P. Arrais and P. A. Silva, "Analysing Games for Health through Users' Opinion Mining," +2021 IEEE 34th International Symposium on Computer-Based Medical Systems (CBMS), Aveiro, Portugal, 2021, pp. 319-323, +doi: 10.1109/CBMS52027.2021.00035. 
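A minimal sketch of loading the dataset defined by this module (its implementation follows below); the first call downloads the data to the local assets directory:

from deepchecks.nlp.datasets.classification import just_dance_comment_analysis

train, test = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True)
print(train.label.shape)             # (n_train_samples, n_label_columns) binary indicator matrix
print(list(train.metadata.columns))  # ['likes', 'dateComment']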
+""" +import pathlib +import typing as t + +import pandas as pd + +from deepchecks.nlp import TextData +from deepchecks.utils.builtin_datasets_utils import read_and_save_data + +__all__ = ['load_data'] + + +_FULL_DATA_URL = 'https://figshare.com/ndownloader/files/40564895' + + +ASSETS_DIR = pathlib.Path(__file__).absolute().parent.parent / 'assets' / 'just_dance_comment_analysis' + +_METADATA_COLS = ['likes', 'dateComment'] +_CAT_METADATA = [] +_CAT_PROPERTIES = ['Language'] +_TEXT_COL = 'originalText' + + +def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_full_size: bool = False) -> \ + t.Union[t.Tuple, t.Union[TextData, pd.DataFrame]]: + """Load and returns the Just Dance Comment Analysis dataset (multi-label classification). + + Parameters + ---------- + data_format : str, default: 'TextData' + Represent the format of the returned value. Can be 'TextData'|'DataFrame' + 'TextData' will return the data as a TextData object + 'Dataframe' will return the data as a pandas DataFrame object + as_train_test : bool, default: True + If True, the returned data is split into train and test exactly like the toy model + was trained. The first return value is the train data and the second is the test data. + In order to get this model, call the load_fitted_model() function. + Otherwise, returns a single object. + use_full_size : bool, default: False + If True, the returned data will be the full dataset, otherwise returns a subset of the data. + + Returns + ------- + dataset : Union[TextData, pd.DataFrame] + the data object, corresponding to the data_format attribute. + train, test : Tuple[Union[TextData, pd.DataFrame],Union[TextData, pd.DataFrame] + tuple if as_train_test = True. Tuple of two objects represents the dataset split to train and test sets. 
+ """ + if data_format.lower() not in ['textdata', 'dataframe']: + raise ValueError('data_format must be either "Dataset" or "Dataframe"') + + data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False) + data['dateComment'] = pd.to_datetime(data['dateComment']) + + if not as_train_test: + if not use_full_size: + data = data[(data['dateComment'] < '2013-01-01') | (data['dateComment'] >= '2021-01-01')] + if data_format.lower() != 'textdata': + return data + + label = data.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int) + dataset = TextData(data[_TEXT_COL], label=label, task_type='text_classification', + metadata=data[_METADATA_COLS], categorical_metadata=_CAT_METADATA) + return dataset + + else: + if use_full_size: + train = data[data['dateComment'] < '2015-01-01'] + test = data[data['dateComment'] >= '2015-01-01'] + else: + train = data[data['dateComment'] < '2013-01-01'] + test = data[data['dateComment'] >= '2021-01-01'] + + if data_format.lower() != 'textdata': + return train, test + + train_metadata, test_metadata = train[_METADATA_COLS], test[_METADATA_COLS] + label_train = train.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int) + label_test = test.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int) + + train_ds = TextData(train[_TEXT_COL], label=label_train, task_type='text_classification', + metadata=train_metadata, categorical_metadata=_CAT_METADATA) + test_ds = TextData(test[_TEXT_COL], label=label_test, task_type='text_classification', + metadata=test_metadata, categorical_metadata=_CAT_METADATA) + + return train_ds, test_ds diff --git a/deepchecks/nlp/datasets/classification/tweet_emotion.py b/deepchecks/nlp/datasets/classification/tweet_emotion.py index 2ccecbc838..e7f3e23d79 100644 --- a/deepchecks/nlp/datasets/classification/tweet_emotion.py +++ b/deepchecks/nlp/datasets/classification/tweet_emotion.py @@ -17,16 +17,14 @@ Dataset originally published in "Semeval-2018 task 1: Affect in tweets" by Mohammad et al. (2018): https://aclanthology.org/S18-1001/. """ -import os import pathlib import typing as t -from io import BytesIO import numpy as np import pandas as pd -import requests from deepchecks.nlp import TextData +from deepchecks.utils.builtin_datasets_utils import read_and_save_data __all__ = ['load_data', 'load_embeddings', 'load_precalculated_predictions'] @@ -58,7 +56,8 @@ def load_embeddings(as_train_test: bool = True) -> t.Union[np.array, t.Tuple[np. embeddings : np.ndarray Embeddings for the tweet_emotion dataset. """ - all_embeddings = _read_and_save('tweet_emotion_embeddings.npy', _EMBEDDINGS_URL, file_type='npy') + all_embeddings = read_and_save_data(ASSETS_DIR, 'tweet_emotion_embeddings.npy', _EMBEDDINGS_URL, + file_type='npy', to_numpy=True) if as_train_test: train_indexes, test_indexes = _get_train_test_indexes() @@ -83,12 +82,7 @@ def load_properties(as_train_test: bool = True) -> t.Union[pd.DataFrame, t.Tuple properties : pd.DataFrame Properties for the tweet_emotion dataset. 
""" - if (ASSETS_DIR / 'tweet_emotion_properties.csv').exists(): - properties = pd.read_csv(ASSETS_DIR / 'tweet_emotion_properties.csv', index_col=0) - else: - properties = pd.read_csv(_PROPERTIES_URL, index_col=0) - properties.to_csv(ASSETS_DIR / 'tweet_emotion_properties.csv') - + properties = read_and_save_data(ASSETS_DIR, 'tweet_emotion_properties.csv', _PROPERTIES_URL, to_numpy=False) if as_train_test: train = properties[properties['train_test_split'] == 'Train'].drop(columns=['train_test_split']) test = properties[properties['train_test_split'] == 'Test'].drop(columns=['train_test_split']) @@ -128,7 +122,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, if data_format.lower() not in ['textdata', 'dataframe']: raise ValueError('data_format must be either "Dataset" or "Dataframe"') - data = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL) + data = read_and_save_data(ASSETS_DIR, 'tweet_emotion_data.csv', _FULL_DATA_URL, to_numpy=False) if not as_train_test: data.drop(columns=['train_test_split'], inplace=True) if data_format.lower() != 'textdata': @@ -165,7 +159,8 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, return train_ds, test_ds -def load_precalculated_predictions(pred_format: str = 'predictions', as_train_test: bool = True) -> np.array: +def load_precalculated_predictions(pred_format: str = 'predictions', as_train_test: bool = True) -> \ + t.Union[np.array, t.Tuple[np.array, np.array]]: """Load and return a precalculated predictions for the dataset. Parameters @@ -185,7 +180,7 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te The prediction of the data elements in the dataset. """ - all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL, to_numpy=True) + all_preds = read_and_save_data(ASSETS_DIR, 'tweet_emotion_probabilities.csv', _PREDICTIONS_URL, to_numpy=True) if pred_format == 'predictions': all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)]) elif pred_format != 'probabilities': @@ -198,31 +193,6 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te return all_preds -def _read_and_save(file_name, url_to_file, file_type='csv', to_numpy=False): - """Read a file from a url and save it to the assets' directory.""" - os.makedirs(ASSETS_DIR, exist_ok=True) - if (ASSETS_DIR / file_name).exists(): - if file_type == 'csv': - data = pd.read_csv(ASSETS_DIR / file_name, index_col=0) - elif file_type == 'npy': - data = np.load(ASSETS_DIR / file_name) - else: - raise ValueError('file_type must be either "csv" or "npy"') - else: - if file_type == 'csv': - data = pd.read_csv(url_to_file, index_col=0) - data.to_csv(ASSETS_DIR / file_name) - elif file_type == 'npy': - data = np.load(BytesIO(requests.get(url_to_file).content)) - np.save(ASSETS_DIR / file_name, data) - else: - raise ValueError('file_type must be either "csv" or "npy"') - - if to_numpy: - data = data.to_numpy() - return data - - def _get_train_test_indexes() -> t.Tuple[np.array, np.array]: """Get the indexes of the train and test sets.""" if (ASSETS_DIR / 'tweet_emotion_data.csv').exists(): diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 345c36c61c..1de6e1679e 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -313,10 +313,12 @@ def lexical_density(raw_text: Sequence[str]) -> List[str]: for text in raw_text: if not pd.isna(text): all_words = 
textblob.TextBlob(text).words - total_words = len(all_words) - total_unique_words = len(set(all_words)) - text_lexical_density = round(total_unique_words * 100 / total_words, 2) - result.append(text_lexical_density) + if len(all_words) == 0: + result.append(np.nan) + else: + total_unique_words = len(set(all_words)) + text_lexical_density = round(total_unique_words * 100 / len(all_words), 2) + result.append(text_lexical_density) else: result.append(np.nan) return result @@ -482,6 +484,7 @@ def calculate_default_properties( Dict[str, str] A dictionary with the property name as key and the property's type as value. """ + raw_text = list(raw_text) default_text_properties = _get_default_properties( include_properties=include_properties, ignore_properties=ignore_properties diff --git a/deepchecks/utils/abstracts/prediction_drift.py b/deepchecks/utils/abstracts/prediction_drift.py index 8e2a7eded2..f197bf5db5 100644 --- a/deepchecks/utils/abstracts/prediction_drift.py +++ b/deepchecks/utils/abstracts/prediction_drift.py @@ -51,8 +51,8 @@ def _prediction_drift(self, train_prediction, test_prediction, model_classes, wi train prediction or probabilities test_prediction : np.ndarray test prediction or probabilities - model_classes : list - list of model classes + model_classes : List[str] + List of model classes names with_display : bool flag for displaying the prediction distribution graph proba_drift : bool diff --git a/deepchecks/utils/builtin_datasets_utils.py b/deepchecks/utils/builtin_datasets_utils.py new file mode 100644 index 0000000000..7cb09838b0 --- /dev/null +++ b/deepchecks/utils/builtin_datasets_utils.py @@ -0,0 +1,45 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see . +# ---------------------------------------------------------------------------- +# +"""utils for loading saving and utilizing deepchecks built in datasets.""" +import os +from io import BytesIO + +import numpy as np +import pandas as pd +import requests + + +def read_and_save_data(assets_dir, file_name, url_to_file, file_type='csv', to_numpy=False): + """If the file exist reads it from the assets' directory, otherwise reads it from the url and saves it.""" + os.makedirs(assets_dir, exist_ok=True) + if (assets_dir / file_name).exists(): + if file_type == 'csv': + data = pd.read_csv(assets_dir / file_name, index_col=0) + elif file_type == 'npy': + data = np.load(assets_dir / file_name) + else: + raise ValueError('file_type must be either "csv" or "npy"') + else: + if file_type == 'csv': + data = pd.read_csv(url_to_file, index_col=0) + data.to_csv(assets_dir / file_name) + elif file_type == 'npy': + data = np.load(BytesIO(requests.get(url_to_file).content)) + np.save(assets_dir / file_name, data) + else: + raise ValueError('file_type must be either "csv" or "npy"') + + if to_numpy: + if isinstance(data, pd.DataFrame): + data = data.to_numpy() + elif not isinstance(data, np.ndarray): + raise ValueError(f'Unknown data type - {type(data)}. 
diff --git a/deepchecks/utils/distribution/preprocessing.py b/deepchecks/utils/distribution/preprocessing.py
index 9db1306019..607defbc78 100644
--- a/deepchecks/utils/distribution/preprocessing.py
+++ b/deepchecks/utils/distribution/preprocessing.py
@@ -215,3 +215,11 @@ def value_frequency(x: Union[List, np.ndarray, pd.Series]) -> List[float]:
     total_occurrences = len(x)
     values_probabilities = list(map(lambda n: n / total_occurrences, x_values_counter.values()))
     return values_probabilities
+
+
+def convert_multi_label_to_multi_class(predictions: np.ndarray, model_classes: List[str]) -> np.ndarray:
+    """Convert multi-label predictions to a multi-class-like prediction format."""
+    predictions = np.asarray(predictions)
+    samples_per_class = np.nansum(np.where(pd.isnull(predictions), np.nan, predictions), axis=0)  # ignore nans
+    all_predictions = [[cls] * int(num_samples) for cls, num_samples in zip(model_classes, samples_per_class)]
+    return np.asarray([item for sublist in all_predictions for item in sublist]).reshape((-1, 1))
diff --git a/tests/nlp/checks/data_integrity/conflicting_labels_test.py b/tests/nlp/checks/data_integrity/conflicting_labels_test.py
index 1e9e6d339d..d7df7c0ec8 100644
--- a/tests/nlp/checks/data_integrity/conflicting_labels_test.py
+++ b/tests/nlp/checks/data_integrity/conflicting_labels_test.py
@@ -154,7 +154,7 @@ def multilabel_dataset_with_conflicts() -> ProblematicDataset:
         # NOTE:
         # tests depend on items order in this list
         AmbiguousDuplicatVariant(
-            labels=[(0, 1, 0), (0, 1, 1)],
+            labels=[(1,), (1, 2)],
             sample_ids=[2, 4],
             text=[
                 "Errors should never pass silently.",
diff --git a/tests/nlp/checks/model_evaluation/prediction_drift_test.py b/tests/nlp/checks/model_evaluation/prediction_drift_test.py
index 7c28711bb1..df19466b91 100644
--- a/tests/nlp/checks/model_evaluation/prediction_drift_test.py
+++ b/tests/nlp/checks/model_evaluation/prediction_drift_test.py
@@ -9,7 +9,7 @@
 #  ----------------------------------------------------------------------------
 #
 """Test for the NLP PredictionDrift check"""
-
+import numpy as np
 from hamcrest import assert_that, close_to, equal_to, has_items
 
 from deepchecks.nlp import TextData
@@ -75,3 +75,23 @@ def test_tweet_emotion_no_drift_no_label(tweet_emotion_train_test_textdata, twee
     ))
 
     assert_that(result.value['Drift score'], equal_to(0))
+
+
+def test_just_dance_small_drift(just_dance_train_test_textdata_sampled):
+    # Arrange
+    train, test = just_dance_train_test_textdata_sampled
+    check = PredictionDrift().add_condition_drift_score_less_than(0.1)
+
+    # Act
+    result = check.run(train, test, train_predictions=np.asarray(train.label),
+                       test_predictions=np.asarray(test.label))
+    condition_result = check.conditions_decision(result)
+
+    # Assert
+    assert_that(condition_result, has_items(
+        equal_condition_result(is_pass=True,
+                               details="Found model prediction Cramer's V drift score of 0.05",
+                               name='Prediction drift score < 0.1')
+    ))
+
+    assert_that(result.value['Drift score'], close_to(0.05, 0.01))
diff --git a/tests/nlp/checks/train_test_validation/label_drift_test.py b/tests/nlp/checks/train_test_validation/label_drift_test.py
index 29e6acf9d5..1f0d23eb01 100644
--- a/tests/nlp/checks/train_test_validation/label_drift_test.py
+++ b/tests/nlp/checks/train_test_validation/label_drift_test.py
@@ -16,6 +16,24 @@
 from tests.base.utils import equal_condition_result
 
 
+def test_just_dance_small_drift(just_dance_train_test_textdata_sampled):
+    # Arrange
+    train, test = 
just_dance_train_test_textdata_sampled + check = LabelDrift().add_condition_drift_score_less_than(0.1) + # Act + result = check.run(train, test) + condition_result = check.conditions_decision(result) + + # Assert + assert_that(condition_result, has_items( + equal_condition_result(is_pass=True, + details="Label's drift score Cramer's V is 0.05", + name='Label drift score < 0.1') + )) + + assert_that(result.value['Drift score'], close_to(0.05, 0.01)) + + def test_tweet_emotion(tweet_emotion_train_test_textdata): # Arrange train, test = tweet_emotion_train_test_textdata @@ -63,9 +81,7 @@ def test_multi_label_without_drift(dummy_multilabel_textdata_train_test): # Assert assert_that(condition_result, has_items( equal_condition_result(is_pass=True, - details="Label's drift score Cramer's V is 0", + details="Label's drift score Cramer's V is 0.02", name='Label drift score < 0.15') )) - assert_that(result.value['Drift score'], close_to(0, 0.01)) - - + assert_that(result.value['Drift score'], close_to(0.02, 0.01)) diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index e6543ca4f8..fe348a384c 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -19,7 +19,7 @@ from nltk import download as nltk_download from nltk.corpus import movie_reviews -from deepchecks.nlp.datasets.classification import tweet_emotion +from deepchecks.nlp.datasets.classification import tweet_emotion, just_dance_comment_analysis from deepchecks.nlp.text_data import TextData @@ -38,6 +38,14 @@ def tweet_emotion_train_test_textdata(): include_embeddings=True) return train, test +@pytest.fixture(scope='session') +def just_dance_train_test_textdata_sampled(): + """Just Dance text multilabel classification dataset""" + train, test = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True) + sampled_train = train.sample(500, random_state=42) + sampled_test = test.sample(500, random_state=42) + return sampled_train, sampled_test + @pytest.fixture(scope='function') def tweet_emotion_train_test_textdata_sampled(): diff --git a/tests/nlp/test_datasets.py b/tests/nlp/test_datasets.py index 03051576bb..3d985c270b 100644 --- a/tests/nlp/test_datasets.py +++ b/tests/nlp/test_datasets.py @@ -12,7 +12,7 @@ import numpy as np from hamcrest import assert_that, contains_exactly, equal_to -from deepchecks.nlp.datasets.classification import tweet_emotion +from deepchecks.nlp.datasets.classification import tweet_emotion, just_dance_comment_analysis def test_tweet_emotion(): @@ -47,3 +47,17 @@ def test_tweet_emotion(): assert_that(embeddings.shape, contains_exactly(4653, 1536)) assert_that(train_embeddings.shape, contains_exactly(2675, 1536)) assert_that(test_embeddings.shape, contains_exactly(1978, 1536)) + + +def test_just_dance_comment_analysis(): + # Arrange + train, test = just_dance_comment_analysis.load_data(data_format='Dataframe', as_train_test=True) + full = just_dance_comment_analysis.load_data(data_format='Dataframe', as_train_test=False) + full_ds = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=False) + + # Act & Assert + assert_that(len(train) + len(test), equal_to(len(full))) + assert_that(train.columns, contains_exactly(*test.columns)) + assert_that(train.columns, contains_exactly(*full.columns)) + + assert_that(len(full_ds.text), equal_to(len(full))) From a08cb72747a4201283a4e653c40893e20510cf74 Mon Sep 17 00:00:00 2001 From: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> Date: Mon, 15 May 2023 16:17:29 +0300 Subject: [PATCH 09/20] Added support of 
token classification to PredictionDrift & LabelDrift (#2533) --- .../model_evaluation/prediction_drift.py | 47 +++++++++------- .../train_test_validation/label_drift.py | 11 ++-- deepchecks/nlp/suites/default_suites.py | 7 +-- .../utils/distribution/preprocessing.py | 3 +- .../model_evaluation/prediction_drift_test.py | 54 ++++++++++++++++++- .../train_test_validation/label_drift_test.py | 14 +++++ 6 files changed, 109 insertions(+), 27 deletions(-) diff --git a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py index 92276d5ae5..b8170ce5e0 100644 --- a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py +++ b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py @@ -9,18 +9,19 @@ # ---------------------------------------------------------------------------- # """Module contains Prediction Drift check.""" +import warnings import numpy as np from deepchecks.core import CheckResult from deepchecks.core.errors import DeepchecksValueError from deepchecks.nlp import Context, TrainTestCheck +from deepchecks.nlp.task_type import TaskType from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract +from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class __all__ = ['PredictionDrift'] -from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class - class PredictionDrift(PredictionDriftAbstract, TrainTestCheck): """ @@ -56,6 +57,8 @@ class PredictionDrift(PredictionDriftAbstract, TrainTestCheck): the predicted probability of the positive class if binary. Set to 'proba' to force drift on the predicted probabilities, and 'prediction' to force drift on the predicted classes. If set to 'proba', on a multiclass task, drift would be calculated on each class independently. + For token classification tasks, drift is always calculated on the predictions and not on the probabilities, + and this parameter is ignored. margin_quantile_filter: float, default: 0.025 float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered out of the EMD calculation. This is done in order for extreme values not to affect the calculation @@ -143,28 +146,34 @@ def run_logic(self, context: Context) -> CheckResult: value: drift score. display: prediction distribution graph, comparing the train and test distributions. 
""" - context.raise_if_token_classification_task(self) - train_dataset = context.train.sample(self.n_samples, random_state=context.random_state) test_dataset = context.test.sample(self.n_samples, random_state=context.random_state) model = context.model - # Flag for computing drift on the probabilities rather than the predicted labels - proba_drift = ((len(context.model_classes) == 2) and (self.drift_mode == 'auto')) or \ - (self.drift_mode == 'proba') - model_classes = context.model_classes - - if proba_drift: - if context.is_multi_label_task(): - raise DeepchecksValueError('Cannot use proba drift mode for multi-label tasks') - train_prediction = np.array(model.predict_proba(train_dataset)) - test_prediction = np.array(model.predict_proba(test_dataset)) - elif context.is_multi_label_task(): - train_prediction = convert_multi_label_to_multi_class(model.predict(train_dataset), model_classes) - test_prediction = convert_multi_label_to_multi_class(model.predict(test_dataset), model_classes) + if self.drift_mode == 'proba' and \ + (context.task_type == TaskType.TOKEN_CLASSIFICATION or context.is_multi_label_task()): + warnings.warn('Cannot use drift_mode="proba" for multi-label text classification tasks or token ' + 'classification tasks. Using drift_mode="prediction" instead.', UserWarning) + + if context.task_type == TaskType.TOKEN_CLASSIFICATION: + train_prediction = np.concatenate(model.predict(train_dataset)).reshape(-1, 1) + test_prediction = np.concatenate(model.predict(test_dataset)).reshape(-1, 1) + proba_drift = False else: - train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1)) - test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1)) + # Flag for computing drift on the probabilities rather than the predicted labels + proba_drift = ((len(context.model_classes) == 2) and (self.drift_mode == 'auto')) or \ + (self.drift_mode == 'proba') + + if proba_drift: + train_prediction = np.array(model.predict_proba(train_dataset)) + test_prediction = np.array(model.predict_proba(test_dataset)) + elif context.is_multi_label_task(): + model_classes = context.model_classes + train_prediction = convert_multi_label_to_multi_class(model.predict(train_dataset), model_classes) + test_prediction = convert_multi_label_to_multi_class(model.predict(test_dataset), model_classes) + else: + train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1)) + test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1)) return self._prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display, proba_drift, not proba_drift) diff --git a/deepchecks/nlp/checks/train_test_validation/label_drift.py b/deepchecks/nlp/checks/train_test_validation/label_drift.py index bb4693ba4f..4646053655 100644 --- a/deepchecks/nlp/checks/train_test_validation/label_drift.py +++ b/deepchecks/nlp/checks/train_test_validation/label_drift.py @@ -10,9 +10,11 @@ # """Module contains Label Drift check.""" +import numpy as np from deepchecks.core import CheckResult from deepchecks.nlp import Context, TrainTestCheck +from deepchecks.nlp.task_type import TaskType from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class @@ -114,12 +116,15 @@ def run_logic(self, context: Context) -> CheckResult: train_dataset = context.train.sample(self.n_samples, random_state=self.random_state) test_dataset = context.test.sample(self.n_samples, 
random_state=self.random_state) - if context.is_multi_label_task(): + if context.task_type == TaskType.TOKEN_CLASSIFICATION: + train_labels = np.concatenate(train_dataset.label) + test_labels = np.concatenate(test_dataset.label) + elif context.is_multi_label_task(): train_labels = convert_multi_label_to_multi_class(train_dataset.label, context.model_classes).flatten() test_labels = convert_multi_label_to_multi_class(test_dataset.label, context.model_classes).flatten() else: train_labels = train_dataset.label test_labels = test_dataset.label - return self._calculate_label_drift(train_labels, test_labels, 'Label', - 'categorical', context.with_display, (train_dataset.name, test_dataset.name)) + return self._calculate_label_drift(train_labels, test_labels, 'Label', 'categorical', context.with_display, + (train_dataset.name, test_dataset.name)) diff --git a/deepchecks/nlp/suites/default_suites.py b/deepchecks/nlp/suites/default_suites.py index 41691df982..c4bdb49aa0 100644 --- a/deepchecks/nlp/suites/default_suites.py +++ b/deepchecks/nlp/suites/default_suites.py @@ -18,9 +18,9 @@ from deepchecks.nlp import Suite from deepchecks.nlp.checks import (ConflictingLabels, LabelDrift, MetadataSegmentsPerformance, PredictionDrift, PropertyDrift, PropertyLabelCorrelation, PropertySegmentsPerformance, - SpecialCharacters, TextDuplicates, TextPropertyOutliers, TrainTestPerformance, - TrainTestSamplesMix, UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments, - UnknownTokens) + SpecialCharacters, TextDuplicates, TextEmbeddingsDrift, TextPropertyOutliers, + TrainTestPerformance, TrainTestSamplesMix, UnderAnnotatedMetaDataSegments, + UnderAnnotatedPropertySegments, UnknownTokens) __all__ = ['data_integrity', 'train_test_validation', 'model_evaluation', 'full_suite'] @@ -148,6 +148,7 @@ def model_evaluation(n_samples: int = None, PredictionDrift(**kwargs).add_condition_drift_score_less_than(), PropertySegmentsPerformance(**kwargs).add_condition_segments_relative_performance_greater_than(), MetadataSegmentsPerformance(**kwargs).add_condition_segments_relative_performance_greater_than(), + TextEmbeddingsDrift().add_condition_overall_drift_value_less_than() ) diff --git a/deepchecks/utils/distribution/preprocessing.py b/deepchecks/utils/distribution/preprocessing.py index 607defbc78..65e84e1b8d 100644 --- a/deepchecks/utils/distribution/preprocessing.py +++ b/deepchecks/utils/distribution/preprocessing.py @@ -30,7 +30,8 @@ from deepchecks.utils.distribution.rare_category_encoder import RareCategoryEncoder from deepchecks.utils.typing import Hashable -__all__ = ['ScaledNumerics', 'preprocess_2_cat_cols_to_same_bins', 'value_frequency'] +__all__ = ['ScaledNumerics', 'preprocess_2_cat_cols_to_same_bins', 'value_frequency', + 'convert_multi_label_to_multi_class'] OTHER_CATEGORY_NAME = 'Other rare categories' diff --git a/tests/nlp/checks/model_evaluation/prediction_drift_test.py b/tests/nlp/checks/model_evaluation/prediction_drift_test.py index df19466b91..19bb229fc6 100644 --- a/tests/nlp/checks/model_evaluation/prediction_drift_test.py +++ b/tests/nlp/checks/model_evaluation/prediction_drift_test.py @@ -10,7 +10,8 @@ # """Test for the NLP PredictionDrift check""" import numpy as np -from hamcrest import assert_that, close_to, equal_to, has_items +import pytest +from hamcrest import assert_that, close_to, equal_to, has_items, has_length from deepchecks.nlp import TextData from deepchecks.nlp.checks import PredictionDrift @@ -95,3 +96,54 @@ def 
test_just_dance_small_drift(just_dance_train_test_textdata_sampled): )) assert_that(result.value['Drift score'], close_to(0.05, 0.01)) + + +def test_token_classification(small_wikiann_train_test_text_data): + # Arrange + train, test = small_wikiann_train_test_text_data + check = PredictionDrift() + + # Act + result = check.run(train, test, train_predictions=np.asarray(train.label), + test_predictions=np.asarray(test.label)) + + # Assert + assert_that(result.value['Drift score'], close_to(0, 0.01)) + + +def test_token_classification_with_nones(small_wikiann_train_test_text_data): + # Arrange + train, test = small_wikiann_train_test_text_data + train_label_with_nones = train.label + train_label_with_nones[0][0] = None + train = TextData(train.text, tokenized_text=train.tokenized_text, + task_type='token_classification') + check = PredictionDrift() + + # Act + result = check.run(train, test, train_predictions=np.asarray(train_label_with_nones), + test_predictions=np.asarray(test.label)) + + # Assert + assert_that(result.value['Drift score'], close_to(0, 0.01)) + + +def test_drift_mode_proba_warnings(small_wikiann_train_test_text_data): + # Arrange + train, test = small_wikiann_train_test_text_data + check = PredictionDrift(drift_mode='proba') + + # Act + with pytest.warns(UserWarning, + match='Cannot use drift_mode="proba" for multi-label text classification tasks or token ' + 'classification tasks. Using drift_mode="prediction" instead.'): + check.run(train, test, train_predictions=np.asarray(train.label), test_predictions=np.asarray(test.label)) + + # Make sure doesn't raise alert regularly: + check = PredictionDrift() + + with pytest.warns(None) as record: + check.run(train, test, train_predictions=np.asarray(train.label), test_predictions=np.asarray(test.label)) + + assert_that(record, has_length(0)) + diff --git a/tests/nlp/checks/train_test_validation/label_drift_test.py b/tests/nlp/checks/train_test_validation/label_drift_test.py index 1f0d23eb01..c31d39941d 100644 --- a/tests/nlp/checks/train_test_validation/label_drift_test.py +++ b/tests/nlp/checks/train_test_validation/label_drift_test.py @@ -85,3 +85,17 @@ def test_multi_label_without_drift(dummy_multilabel_textdata_train_test): name='Label drift score < 0.15') )) assert_that(result.value['Drift score'], close_to(0.02, 0.01)) + + +def test_token_classification(small_wikiann_train_test_text_data): + # Arrange + train, test = small_wikiann_train_test_text_data + + # Act + check = LabelDrift() + result = check.run(train, test) + + # Assert + assert_that(result.value['Drift score'], close_to(0.01, 0.01)) + + From 7f889928ae56180b895e4088f94707b9aa124382 Mon Sep 17 00:00:00 2001 From: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> Date: Mon, 15 May 2023 17:06:38 +0300 Subject: [PATCH 10/20] Updated label display in NLP (#2532) --- .../text_embeddings_drift.py | 1 + deepchecks/nlp/text_data.py | 34 +++++++++++++++---- .../multivariate_embeddings_drift_utils.py | 9 ++--- deepchecks/nlp/utils/nlp_plot.py | 7 ++-- tests/nlp/test_text_data.py | 30 ++++++++++++++++ 5 files changed, 68 insertions(+), 13 deletions(-) diff --git a/deepchecks/nlp/checks/train_test_validation/text_embeddings_drift.py b/deepchecks/nlp/checks/train_test_validation/text_embeddings_drift.py index 15022a470e..973e5ca01b 100644 --- a/deepchecks/nlp/checks/train_test_validation/text_embeddings_drift.py +++ b/deepchecks/nlp/checks/train_test_validation/text_embeddings_drift.py @@ -105,6 +105,7 @@ def run_logic(self, context: Context) -> CheckResult: 
num_samples_in_display=self.num_samples_in_display, dimension_reduction_method=self.dimension_reduction_method, with_display=context.with_display, + model_classes=context.model_classes ) return CheckResult(value=values_dict, display=displays, header='Embeddings Drift') diff --git a/deepchecks/nlp/text_data.py b/deepchecks/nlp/text_data.py index 7ab1112fb0..dac7ebc97e 100644 --- a/deepchecks/nlp/text_data.py +++ b/deepchecks/nlp/text_data.py @@ -512,16 +512,23 @@ def label(self) -> TTextLabel: 'to run the requested functionalities') return self._label - @property - def label_for_display(self) -> TTextLabel: - """Return the label defined in the dataset. + def label_for_display(self, model_classes: list = None) -> TTextLabel: + """Return the label defined in the dataset in a format that can be displayed. + + Parameters + ---------- + model_classes : list, default None + List of classes names to use for multi-label display. Only used if the dataset is multi-label. Returns ------- TTextLabel """ if self.is_multi_label_classification(): - return [np.argwhere(x == 1).flatten().tolist() for x in self.label] + ret_labels = [np.argwhere(x == 1).flatten().tolist() for x in self.label] + if model_classes: + ret_labels = [[model_classes[i] for i in x] for x in ret_labels] + return ret_labels else: return self.label @@ -588,13 +595,26 @@ def validate_textdata_compatibility(self, other_text_data: 'TextData') -> bool: return True - def head(self, n_samples: int = 5) -> pd.DataFrame: - """Return a copy of the dataset as a pandas Dataframe with the first n_samples samples.""" + def head(self, n_samples: int = 5, model_classes: list = None) -> pd.DataFrame: + """Return a copy of the dataset as a pandas Dataframe with the first n_samples samples. + + Parameters + ---------- + n_samples : int, default 5 + Number of samples to return. + model_classes : list, default None + List of classes names to use for multi-label display. Only used if the dataset is multi-label. + + Returns + ------- + pd.DataFrame + A copy of the dataset as a pandas Dataframe with the first n_samples samples. 
+ """ if n_samples > len(self): n_samples = len(self) - 1 result = pd.DataFrame({'text': self.text[:n_samples]}, index=self.get_original_text_indexes()[:n_samples]) if self.has_label(): - result['label'] = self.label_for_display[:n_samples] + result['label'] = self.label_for_display(model_classes=model_classes)[:n_samples] if self._tokenized_text is not None: result['tokenized_text'] = self.tokenized_text[:n_samples] if self._metadata is not None: diff --git a/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py b/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py index b6fd009cbd..b78f70b26d 100644 --- a/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py +++ b/deepchecks/nlp/utils/multivariate_embeddings_drift_utils.py @@ -29,7 +29,7 @@ def run_multivariable_drift_for_embeddings(train_dataset: TextData, test_dataset: TextData, sample_size: int, random_state: int, test_size: float, num_samples_in_display: int, dimension_reduction_method: str, - with_display: bool): + model_classes: list, with_display: bool): """Calculate multivariable drift on embeddings.""" np.random.seed(random_state) @@ -99,14 +99,15 @@ def run_multivariable_drift_for_embeddings(train_dataset: TextData, test_dataset displays = [build_drift_plot(drift_score), display_embeddings(train_dataset=train_dataset_for_display, test_dataset=test_dataset_for_display, - random_state=random_state)] + random_state=random_state, + model_classes=model_classes)] else: displays = None return values_dict, displays -def display_embeddings(train_dataset: TextData, test_dataset: TextData, random_state: int): +def display_embeddings(train_dataset: TextData, test_dataset: TextData, random_state: int, model_classes: list): """Display the embeddings with the domain classifier proba as the x-axis and the embeddings as the y-axis.""" embeddings = np.concatenate([train_dataset.embeddings, test_dataset.embeddings]) @@ -120,4 +121,4 @@ def display_embeddings(train_dataset: TextData, test_dataset: TextData, random_s y_axis_title: reduced_embeddings[:, 1]}) plot_title = 'Scatter Plot of Embeddings Space (reduced to 2 dimensions)' return two_datasets_scatter_plot(plot_title=plot_title, plot_data=plot_data, train_dataset=train_dataset, - test_dataset=test_dataset) + test_dataset=test_dataset, model_classes=model_classes) diff --git a/deepchecks/nlp/utils/nlp_plot.py b/deepchecks/nlp/utils/nlp_plot.py index 40baefe238..271b506e4f 100644 --- a/deepchecks/nlp/utils/nlp_plot.py +++ b/deepchecks/nlp/utils/nlp_plot.py @@ -236,7 +236,7 @@ def get_text_outliers_graph(dist: Sequence, data: Sequence[str], lower_limit: fl def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_dataset: TextData, - test_dataset: TextData): + test_dataset: TextData, model_classes: list): """Plot a scatter plot of two datasets. Parameters @@ -249,6 +249,8 @@ def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_da The train dataset. test_dataset : TextData The test dataset. + model_classes : list + The names of the model classes (relevant only if the datasets are multi-label). 
""" axes = plot_data.columns if train_dataset.name and test_dataset.name: @@ -258,7 +260,8 @@ def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_da plot_data['Dataset'] = [dataset_names[0]] * len(train_dataset) + [dataset_names[1]] * len(test_dataset) if train_dataset.has_label(): - plot_data['Label'] = np.concatenate([train_dataset.label_for_display, test_dataset.label_for_display]) + plot_data['Label'] = list(train_dataset.label_for_display(model_classes=model_classes)) + \ + list(test_dataset.label_for_display(model_classes=model_classes)) else: plot_data['Label'] = None plot_data['Sample'] = np.concatenate([train_dataset.text, test_dataset.text]) diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index 4ecfd1e0ef..2c0b7dafa6 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -147,6 +147,36 @@ def test_head_functionality(): assert_that(list(result.index), contains_exactly(0, 1)) +def test_label_for_display(): + # Arrange + text = ['a', 'b b b', 'c c c c'] + single_label = ['PER', 'ORG', 'GEO'] + multi_label = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 1]]) + + # Act + dataset = TextData(raw_text=text, task_type='text_classification', label=single_label) + result = dataset.label_for_display() + + # Assert + assert_that(len(result), equal_to(3)) + assert_that(result, contains_exactly('PER', 'ORG', 'GEO')) + + # Act + dataset = TextData(raw_text=text, task_type='text_classification', label=multi_label) + result = dataset.label_for_display() + + # Assert + assert_that(len(result), equal_to(3)) + assert_that(result[0], contains_exactly(0, 2)) + + # Act + result = dataset.label_for_display(model_classes=['PER', 'ORG', 'GEO']) + + # Assert + assert_that(len(result), equal_to(3)) + assert_that(result[0], contains_exactly('PER', 'GEO')) + + def test_properties(text_classification_dataset_mock): # Arrange dataset = text_classification_dataset_mock From 90fff26fe3808ad37aca08c695329a3651317821 Mon Sep 17 00:00:00 2001 From: Yurii Romanyshyn <71635444+yromanyshyn@users.noreply.github.com> Date: Mon, 15 May 2023 17:16:50 +0300 Subject: [PATCH 11/20] [DEE-417] do not calculate english-only text properties for not english samples (#2525) * text properties calculation reactoring * code style fixes * code style fixes * code style fixes * tests fix * additional tests; small fixes; * tests fixes; code style fixes; * tests fixes * makefile: win-test target fix * return np.nan in case of classifier failure * small fix * makefile fix * small fix * added progress bar * modifed "gpu-tests" CI/CD job * modified gpu-tests job * gpu-tests job fix * nope; * vision gpu tests refactoring * Update deepchecks/nlp/utils/text_properties.py Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> * modified test-win makefile target; * small fix * makefile fix --------- Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> --- .github/workflows/build.yml | 2 +- .../train_test_validation/property_drift.py | 2 - deepchecks/nlp/utils/text_properties.py | 272 +++++++++++++----- makefile | 24 +- .../under_annotated_segments_test.py | 15 +- .../property_drift_test.py | 52 ++-- tests/nlp/conftest.py | 2 +- tests/nlp/test_datasets.py | 2 +- tests/nlp/test_text_data.py | 18 +- tests/nlp/utils/test_properties.py | 40 ++- 10 files changed, 309 insertions(+), 120 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0d2ba8ad2c..e9b6df7b6a 100644 --- a/.github/workflows/build.yml +++ 
b/.github/workflows/build.yml
@@ -157,7 +157,7 @@ jobs:
       - name: Set Up Env
         run: make env
       - name: Run Tests
-        run: make test args=tests/vision/gpu_tests
+        run: make vision-gpu-tests
 
 
 #  documentation-check:
diff --git a/deepchecks/nlp/checks/train_test_validation/property_drift.py b/deepchecks/nlp/checks/train_test_validation/property_drift.py
index 9b9fa0218f..8c211c84ff 100644
--- a/deepchecks/nlp/checks/train_test_validation/property_drift.py
+++ b/deepchecks/nlp/checks/train_test_validation/property_drift.py
@@ -22,8 +22,6 @@
 
 __all__ = ['PropertyDrift']
 
-# TODO:
-# refactor, separate general drift logic into separate class/module and use it with drift checks
 
 class PropertyDrift(TrainTestCheck, FeatureDriftAbstract):
     """
     Calculate drift between train dataset and test dataset per feature, using statistical measures.
diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py
index 1de6e1679e..912550d4be 100644
--- a/deepchecks/nlp/utils/text_properties.py
+++ b/deepchecks/nlp/utils/text_properties.py
@@ -13,7 +13,7 @@
 import pathlib
 import string
 import warnings
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -22,9 +22,11 @@
 from nltk import corpus
 from nltk import download as nltk_download
 from nltk import sent_tokenize, word_tokenize
+from typing_extensions import TypedDict
 
 from deepchecks.nlp.utils.text import remove_punctuation
 from deepchecks.utils.function import run_available_kwargs
+from deepchecks.utils.ipython import create_progress_bar
 
 __all__ = ['calculate_default_properties']
 
@@ -201,29 +203,38 @@ def percentage_special_characters(raw_text: Sequence[str]) -> List[float]:
 
 def max_word_length(raw_text: Sequence[str]) -> List[int]:
     """Return list of integers of max word length."""
-    return [max([len(word) for word in text.split()]) for text in raw_text]
+    result = []
+    for text in raw_text:
+        words = text.split()
+        if not words:
+            result.append(np.nan)
+            continue
+        result.append(max(len(w) for w in words))
+    return result
 
 
-def language(raw_text: Sequence[str],
-             models_storage: Union[pathlib.Path, str, None] = None,
-             lang_certainty_threshold: float = 0.8
-             ) -> List[str]:
+def language(
+        raw_text: Sequence[str],
+        models_storage: Union[pathlib.Path, str, None] = None,
+        lang_certainty_threshold: float = 0.8
+) -> List[str]:
     """Return list of strings of language."""
     fasttext = _import_optional_property_dependency(module='fasttext', property_name='language')
 
     model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
-
     model_path = get_creat_model_storage(models_storage)
     model_path = model_path / 'fasttext'
+
     if not model_path.exists():
         model_path.mkdir(parents=True)
+
     model_path = model_path / model_name
 
     # Save the model to a file
     if not model_path.exists():
-        response = requests.get(FASTTEXT_LANG_MODEL)
-        with open(model_path, 'wb') as f:
-            f.write(response.content)
+        response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
+        if response.status_code != 200:
+            raise RuntimeError('Failed to download fasttext model')
+        model_path.write_bytes(response.content)
 
     # This weird code is to suppress a warning from fasttext about a deprecated function
     try:
@@ -233,10 +244,17 @@ def language(raw_text: Sequence[str],
         raise exp
 
     # Predictions are the first prediction (k=1), only if the probability is above the threshold
-    predictions = model.predict(list(raw_text), k=1, threshold=lang_certainty_threshold)
-
-    # x is empty for detection below threshold
-    language_codes = [x[0].replace('__label__', '') if x else np.nan for x in predictions[0]]
+    predictions = [
+        model.predict(it.replace('\n', ' '), k=1, threshold=lang_certainty_threshold)
+        if it is not None
+        else (None, None)
+        for it in raw_text
+    ]
+    # labels is empty for detection below threshold
+    language_codes = [
+        labels[0].replace('__label__', '') if labels else None
+        for labels, _ in predictions
+    ]
 
     return language_codes
 
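The rewritten loop above queries fasttext one sample at a time instead of in a single batch, which is what
lets None samples and below-threshold detections map cleanly to None. As a rough sketch of the per-sample
behaviour the new code relies on (the input string is arbitrary and the exact result depends on the
downloaded model):

    labels, probabilities = model.predict('Bonjour tout le monde', k=1, threshold=lang_certainty_threshold)
    # e.g. labels == ('__label__fr',); when no label clears the threshold, labels is an empty tuple
    language_code = labels[0].replace('__label__', '') if labels else None
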
@@ -251,6 +269,30 @@ def subjectivity(raw_text: Sequence[str]) -> List[str]:
     return [textblob.TextBlob(text).sentiment.subjectivity for text in raw_text]
 
 
+def _predict(text, classifier, kind):
+    try:
+        v = classifier(text)
+    except Exception:  # pylint: disable=broad-except
+        return np.nan
+    else:
+        if not v:
+            return np.nan
+        v = v[0]
+        if kind == 'toxicity':
+            return v['score']
+        elif kind == 'fluency':
+            label_value = 'LABEL_1'
+        elif kind == 'formality':
+            label_value = 'formal'
+        else:
+            raise ValueError('Unsupported value for "kind" parameter')
+        return (
+            v['score']
+            if v['label'] == label_value
+            else 1 - v['score']
+        )
+
+
 def toxicity(
     raw_text: Sequence[str],
     device: Optional[int] = None,
@@ -264,7 +306,10 @@
         device=device,
         models_storage=models_storage
     )
-    return [x['score'] for x in classifier(raw_text)]
+    return [
+        _predict(text, classifier, 'toxicity')
+        for text in raw_text
+    ]
 
 
 def fluency(
@@ -280,7 +325,10 @@
         device=device,
         models_storage=models_storage
     )
-    return [x['score'] if x['label'] == 'LABEL_1' else 1 - x['score'] for x in classifier(raw_text)]
+    return [
+        _predict(text, classifier, 'fluency')
+        for text in raw_text
+    ]
 
 
 def formality(
@@ -296,7 +344,10 @@
         device=device,
         models_storage=models_storage
     )
-    return [x['score'] if x['label'] == 'formal' else 1 - x['score'] for x in classifier(raw_text)]
+    return [
+        _predict(text, classifier, 'formality')
+        for text in raw_text
+    ]
 
 
 def lexical_density(raw_text: Sequence[str]) -> List[str]:
@@ -324,7 +375,7 @@ def lexical_density(raw_text: Sequence[str]) -> List[str]:
     return result
 
 
-def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
+def unique_noun_count(raw_text: Sequence[str]) -> List[float]:
     """Return a list of integers of number of unique noun words in the text."""
     if not nltk_download('averaged_perceptron_tagger', quiet=True):
         warnings.warn('nltk averaged_perceptron_tagger not found, unique noun count cannot be calculated.'
@@ -340,7 +391,7 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
     return result
 
 
-def readability_score(raw_text: Sequence[str]) -> List[str]:
+def readability_score(raw_text: Sequence[str]) -> List[float]:
     """Return a list of floats of Flesch Reading-Ease score per text sample.
 
     In the Flesch reading-ease test, higher scores indicate material that is easier to read
@@ -376,7 +427,7 @@ def readability_score(raw_text: Sequence[str]) -> List[str]:
     return result
 
 
-def average_sentence_length(raw_text: Sequence[str]) -> List[str]:
+def average_sentence_length(raw_text: Sequence[str]) -> List[float]:
     """Return a list of floats denoting the average sentence length per text sample."""
     if not nltk_download('punkt', quiet=True):
         warnings.warn('nltk punkt not found, average sentence length cannot be calculated.'
@@ -397,7 +448,13 @@
     return result
 
 
-DEFAULT_PROPERTIES = (
+class TextProperty(TypedDict):
+    name: str
+    method: Callable[..., Sequence[Any]]
+    output_type: str
+
+
+DEFAULT_PROPERTIES: Tuple[TextProperty, ...] 
= ( {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'}, {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'}, {'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'}, @@ -414,30 +471,60 @@ def average_sentence_length(raw_text: Sequence[str]) -> List[str]: {'name': 'Average Sentence Length', 'method': average_sentence_length, 'output_type': 'numeric'}, ) -LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Unique Noun Count'] -ENGLISH_ONLY_PROPERTIES = ['Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality'] + +LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count') LARGE_SAMPLE_SIZE = 10_000 +ENGLISH_ONLY_PROPERTIES = ( + 'Sentiment', 'Subjectivity', 'Toxicity', + 'Fluency', 'Formality', 'Readability Score', + 'Unique Noun Count' +) -def _get_default_properties( - include_properties: Optional[List[str]] = None, - ignore_properties: Optional[List[str]] = None -): - """Return the default properties. - Default properties are defined here and not outside the function so not to import all the packages - if they are not needed. - """ +def _select_properties( + *, + n_of_samples: int, + include_properties: Optional[List[str]] = None, + ignore_properties: Optional[List[str]] = None, + include_long_calculation_properties: bool = False, + device: Optional[str] = None, +) -> Sequence[TextProperty]: + """Select properties.""" properties = DEFAULT_PROPERTIES - # Filter by properties or ignore_properties: if include_properties is not None and ignore_properties is not None: raise ValueError('Cannot use properties and ignore_properties parameters together.') - elif include_properties is not None: + + if include_properties is not None: properties = [prop for prop in properties if prop['name'] in include_properties] elif ignore_properties is not None: properties = [prop for prop in properties if prop['name'] not in ignore_properties] + if not include_long_calculation_properties: + return [ + prop for prop in properties + if prop['name'] not in LONG_RUN_PROPERTIES + ] + + heavy_properties = [ + prop for prop in properties + if prop['name'] in LONG_RUN_PROPERTIES + ] + + if heavy_properties and n_of_samples > LARGE_SAMPLE_SIZE: + h_prop_names = [ + prop['name'] + for prop in heavy_properties + ] + warning_message = ( + f'Calculating the properties {h_prop_names} on a large dataset may take a long time. ' + 'Consider using a smaller sample size or running this code on better hardware.' + ) + if device is None or device == 'cpu': + warning_message += ' Consider using a GPU or a similar device to run these properties.' + warnings.warn(warning_message, UserWarning) + return properties @@ -445,7 +532,7 @@ def calculate_default_properties( raw_text: Sequence[str], include_properties: Optional[List[str]] = None, ignore_properties: Optional[List[str]] = None, - include_long_calculation_properties: Optional[bool] = False, + include_long_calculation_properties: bool = False, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None ) -> Tuple[Dict[str, List[float]], Dict[str, str]]: @@ -484,48 +571,101 @@ def calculate_default_properties( Dict[str, str] A dictionary with the property name as key and the property's type as value. 
""" - raw_text = list(raw_text) - default_text_properties = _get_default_properties( + text_properties = _select_properties( include_properties=include_properties, - ignore_properties=ignore_properties + ignore_properties=ignore_properties, + device=device, + include_long_calculation_properties=include_long_calculation_properties, + n_of_samples=len(raw_text) ) + properties_types = { + it['name']: it['output_type'] + for it in text_properties + } - if not include_long_calculation_properties: - default_text_properties = [ - prop for prop in default_text_properties - if prop['name'] not in LONG_RUN_PROPERTIES - ] - else: # Check if the run may take a long time and warn - heavy_properties = [prop for prop in default_text_properties if prop['name'] in LONG_RUN_PROPERTIES] - if heavy_properties and len(raw_text) > LARGE_SAMPLE_SIZE: - h_prop_names = [prop['name'] for prop in heavy_properties] - warning_message = f'Calculating the properties {h_prop_names} on a large dataset may take a long time.' \ - f' Consider using a smaller sample size or running this code on better hardware.' - if device is None or device == 'cpu': - warning_message += ' Consider using a GPU or a similar device to run these properties.' + kwargs = dict(device=device, models_storage=models_storage) + english_properties_names = set(ENGLISH_ONLY_PROPERTIES) + text_properties_names = {it['name'] for it in text_properties} + samples_language = None + english_samples = [] + english_samples_mask = [] + calculated_properties = {} - warnings.warn(warning_message, UserWarning) + if english_properties_names & text_properties_names: + samples_language = run_available_kwargs( + language, + raw_text=raw_text, + **kwargs + ) - calculated_properties = {} - for prop in default_text_properties: - try: - calculated_properties[prop['name']] = run_available_kwargs( - prop['method'], - raw_text=raw_text, - device=device, - models_storage=models_storage - ) - except ImportError as e: - warnings.warn(f'Failed to calculate property {prop["name"]}.\nError: {e}') + for lang, text in zip(samples_language, raw_text): + if lang == 'en': + english_samples.append(text) + english_samples_mask.append(True) + else: + english_samples_mask.append(False) + + new_text_properties = [] + + for prop in text_properties: + if prop['name'] == 'Language': + calculated_properties['Language'] = samples_language + else: + new_text_properties.append(prop) + + text_properties = new_text_properties + + warning_message = ( + 'Failed to calculate property {0}. ' + 'Dependencies required by property are not installed. 
' + 'Error:\n{1}' + ) + + progress_bar = create_progress_bar( + iterable=list(text_properties), + name='Text Properties Calculation', + unit='Text Property' + ) + + # TODO: refactor + for prop in progress_bar: + progress_bar.set_postfix( + {'Property': prop['name']}, + refresh=False + ) + if prop['name'] not in english_properties_names: + try: + values = run_available_kwargs(prop['method'], raw_text=raw_text, **kwargs) + except ImportError as e: + warnings.warn(warning_message.format(prop['name'], str(e))) + continue + else: + calculated_properties[prop['name']] = values + else: + try: + values = run_available_kwargs(prop['method'], raw_text=english_samples, **kwargs) + except ImportError as e: + warnings.warn(warning_message.format(prop['name'], str(e))) + continue + else: + result = [] + idx = 0 + fill_value = np.nan if prop['output_type'] == 'numeric' else None + for mask in english_samples_mask: + if mask: + result.append(values[idx]) + idx += 1 + else: + result.append(fill_value) + calculated_properties[prop['name']] = result if not calculated_properties: raise RuntimeError('Failed to calculate any of the properties.') - # TODO: Add tests properties_types = { - prop['name']: prop['output_type'] - for prop in default_text_properties - if prop['name'] in calculated_properties + k: v + for k, v in properties_types.items() + if k in calculated_properties } return calculated_properties, properties_types diff --git a/makefile b/makefile index 0ee5383554..1369ecd817 100644 --- a/makefile +++ b/makefile @@ -172,18 +172,24 @@ vision-torch-tf-setup: env @$(PIP) install -q "tensorflow-hub==0.12.0"; -nlp-tests-setup: env - @echo "#### installing nlp properties packages #### " - $(PIP) install -q "langdetect>=1.0.9" "textblob>=0.17.1"; - -requirements: vision-torch-tf-setup nlp-tests-setup +requirements: vision-torch-tf-setup @echo "#### installing dependencies, it could take some time, please wait! #### " @$(PIP) install -U pip @$(PIP) install wheel setuptools setuptools_scm @$(PIP) install -q \ -r $(REQUIRE_DIR)/$(REQUIRE_FILE) \ -r $(REQUIRE_DIR)/vision-$(REQUIRE_FILE) \ - -r $(REQUIRE_DIR)/nlp-$(REQUIRE_FILE) + -r $(REQUIRE_DIR)/nlp-$(REQUIRE_FILE) \ + -r $(REQUIRE_DIR)/nlp-prop-$(REQUIRE_FILE) + @$(PIP) install --no-deps -e . + +vision-requirements: vision-torch-tf-setup + @echo "#### installing dependencies, it could take some time, please wait! #### " + @$(PIP) install -U pip + @$(PIP) install wheel setuptools setuptools_scm + @$(PIP) install -q \ + -r $(REQUIRE_DIR)/$(REQUIRE_FILE) \ + -r $(REQUIRE_DIR)/vision-$(REQUIRE_FILE) @$(PIP) install --no-deps -e . doc-requirements: $(ENV) @@ -227,13 +233,16 @@ test: requirements dev-requirements fi; +vision-gpu-tests: vision-requirements dev-requirements + $(PYTEST) $(TESTDIR)/vision/gpu_tests + + test-win: @test -d $(WIN_ENV) || python -m venv $(WIN_ENV) @$(WIN_ENV)\Scripts\activate.bat $(PIP_WIN) install -q\ "torch==1.10.2+cpu" "torchvision==0.11.3+cpu" \ -f https://s3.amazonaws.com/pytorch/whl/torch_stable.html; - @$(PIP_WIN) install -q "langdetect>=1.0.9" "textblob>=0.17.1"; @$(PIP_WIN) install -q "tensorflow-hub==0.12.0"; @$(PIP_WIN) install -q "tensorflow==2.11.0"; @$(PIP_WIN) install -U pip @@ -241,6 +250,7 @@ test-win: -r $(REQUIRE_DIR)/$(REQUIRE_FILE) \ -r $(REQUIRE_DIR)/vision-$(REQUIRE_FILE) \ -r $(REQUIRE_DIR)/nlp-$(REQUIRE_FILE) \ + -r $(REQUIRE_DIR)/nlp-prop-$(REQUIRE_FILE) \ -r $(REQUIRE_DIR)/dev-$(REQUIRE_FILE) @$(PIP_WIN) install -e . 
python -m pytest -vvv $(WIN_TESTDIR) diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index 3c88b13f91..f5a45657c1 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -112,16 +112,17 @@ def test_token_classification_dataset(small_wikiann_train_test_text_data): # Assert assert_that(condition_result, has_items( - equal_condition_result(is_pass=False, - details='Found a segment with annotation ratio of 0.2 in comparison to an ' - 'average score of 0.8 in sampled data.', - name='The relative performance of weakest segment is greater than 80% of average model ' - 'performance.') + equal_condition_result( + is_pass=False, + details='Found a segment with annotation ratio of 0.375 in comparison to an ' + 'average score of 0.8 in sampled data.', + name='The relative performance of weakest segment is greater than 80% of average model ' + 'performance.') )) assert_that(result.value['avg_score'], close_to(0.8, 0.001)) - assert_that(len(result.value['weak_segments_list']), equal_to(25)) - assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.2, 0.01)) + assert_that(len(result.value['weak_segments_list']), equal_to(22)) + assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.375, 0.01)) def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py index 0de5e5589e..9f76d97f35 100644 --- a/tests/nlp/checks/train_test_validation/property_drift_test.py +++ b/tests/nlp/checks/train_test_validation/property_drift_test.py @@ -47,8 +47,8 @@ def test_without_drift(self, tweet_emotion_train_test_textdata): def test_with_drift(self, tweet_emotion_train_test_textdata): # Arrange train, test = tweet_emotion_train_test_textdata - train = train.sample(20, random_state=0) - test = test.sample(20, random_state=0) + train = train.sample(30, random_state=0) + test = test.sample(30, random_state=0) train.calculate_default_properties() test.calculate_default_properties() @@ -62,30 +62,29 @@ def test_with_drift(self, tweet_emotion_train_test_textdata): # Assert assert len(condition_results) == 1 assert condition_results[0].is_pass() is False - assert_that(result.value, has_entries({ "Subjectivity": { - "Drift score": 0.15000000000000002, + "Drift score": 0.14333333333333337, "Method": "Kolmogorov-Smirnov", "Importance": None}, "Average Word Length": { - "Drift score": 0.4, + "Drift score": 0.16666666666666666, "Method": "Kolmogorov-Smirnov", "Importance": None}, "Text Length": { - "Drift score": 0.19999999999999996, + "Drift score": 0.13333333333333333, "Method": "Kolmogorov-Smirnov", "Importance": None}, "Max Word Length": { - "Drift score": 0.19999999999999996, + "Drift score": 0.13333333333333341, "Method": "Kolmogorov-Smirnov", "Importance": None}, "% Special Characters": { - "Drift score": 0.19999999999999996, + "Drift score": 0.23333333333333334, "Method": "Kolmogorov-Smirnov", "Importance": None}, "Sentiment": { - "Drift score": 0.15000000000000002, + "Drift score": 0.1133333333333334, "Method": "Kolmogorov-Smirnov", "Importance": None}, })) # type: ignore @@ -108,9 +107,9 @@ def test_without_drift(self, small_wikiann_train_test_text_data): assert_that(result.value, has_entries({ 'Text Length': has_entries({'Drift score': 0.0, 'Method': 
'Kolmogorov-Smirnov'}), '% Special Characters': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}), - 'Sentiment': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}), + 'Sentiment': has_entries({'Drift score': None, 'Method': None}), 'Average Word Length': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}), - 'Subjectivity': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}), + 'Subjectivity': has_entries({'Drift score': None, 'Method': None}), 'Max Word Length': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}) })) # type: ignore @@ -125,7 +124,7 @@ def test_with_drift(self, small_wikiann_train_test_text_data): include_long_calculation_properties=False ) - check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than() + check = PropertyDrift(min_samples=40).add_condition_drift_score_less_than() # Act result = check.run(train_dataset=train, test_dataset=test) @@ -140,8 +139,8 @@ def test_with_drift(self, small_wikiann_train_test_text_data): 'Average Word Length': has_entries({'Drift score': 0.1, 'Method': 'Kolmogorov-Smirnov'}), '% Special Characters': has_entries({'Drift score': 0.16000000000000003, 'Method': 'Kolmogorov-Smirnov'}), 'Text Length': has_entries({'Drift score': 0.30000000000000004, 'Method': 'Kolmogorov-Smirnov'}), - 'Subjectivity': has_entries({'Drift score': 0.14, 'Method': 'Kolmogorov-Smirnov'}), - 'Sentiment': has_entries({'Drift score': 0.08000000000000007, 'Method': 'Kolmogorov-Smirnov'}) + 'Subjectivity': has_entries({'Drift score': None, 'Method': None}), + 'Sentiment': has_entries({'Drift score': None, 'Method': None}) })) # type: ignore @@ -174,15 +173,24 @@ def test_with_drift(self, dummy_multilabel_textdata_train_test): properties_to_ignore = ['Lexical Density','Unique Noun Count', 'Average Sentence Length', 'Readability Score'] train.calculate_default_properties(ignore_properties=properties_to_ignore) test.calculate_default_properties(ignore_properties=properties_to_ignore) - check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3, - max_allowed_categorical_score=0.3) + + check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than( + max_allowed_numeric_score=0.3, + max_allowed_categorical_score=0.3 + ) + # Act result = check.run(train_dataset=train, test_dataset=test) condition_results = check.conditions_decision(result) + assert_that(condition_results, has_items( - equal_condition_result(is_pass=False, - details="Failed for 1 out of 6 columns.\nFound 1 " - "numeric columns with Kolmogorov-Smirnov above threshold: " - "{'Text Length': '0.33'}", - name='categorical drift score < 0.3 and numerical drift score < 0.3') - )) + equal_condition_result( + is_pass=False, + details=( + "Failed for 1 out of 7 columns.\nFound 1 " + "numeric columns with Kolmogorov-Smirnov above threshold: " + "{'Text Length': '0.33'}"), + name=( + 'categorical drift score < 0.3 and numerical drift score < 0.3') + ) + )) # type: ignore diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index fe348a384c..27172aa3b2 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -19,7 +19,7 @@ from nltk import download as nltk_download from nltk.corpus import movie_reviews -from deepchecks.nlp.datasets.classification import tweet_emotion, just_dance_comment_analysis +from deepchecks.nlp.datasets.classification import just_dance_comment_analysis, tweet_emotion from deepchecks.nlp.text_data import TextData diff --git 
a/tests/nlp/test_datasets.py b/tests/nlp/test_datasets.py index 3d985c270b..05b96c3612 100644 --- a/tests/nlp/test_datasets.py +++ b/tests/nlp/test_datasets.py @@ -12,7 +12,7 @@ import numpy as np from hamcrest import assert_that, contains_exactly, equal_to -from deepchecks.nlp.datasets.classification import tweet_emotion, just_dance_comment_analysis +from deepchecks.nlp.datasets.classification import just_dance_comment_analysis, tweet_emotion def test_tweet_emotion(): diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index 2c0b7dafa6..799a0eaca7 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -184,15 +184,19 @@ def test_properties(text_classification_dataset_mock): # Act & Assert assert_that(dataset._properties, equal_to(None)) # TODO: Create test for the heavy properties - dataset.calculate_default_properties(ignore_properties=['topic'] + LONG_RUN_PROPERTIES) + dataset.calculate_default_properties(ignore_properties=['topic', *LONG_RUN_PROPERTIES]) properties = dataset.properties assert_that(properties.shape[0], equal_to(3)) - assert_that(properties.shape[1], equal_to(9)) - assert_that(properties.columns, - contains_exactly('Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', - 'Sentiment', 'Subjectivity', 'Lexical Density', 'Readability Score', - 'Average Sentence Length')) - assert_that(properties.iloc[0].values, contains_exactly(22, 3.6, 9, 0.0, 0.0, 0.0, 80.0, 100.24, 5)) + assert_that(properties.shape[1], equal_to(10)) + assert_that(properties.columns, contains_exactly( + 'Language','Text Length', 'Average Word Length', + 'Max Word Length', '% Special Characters', 'Sentiment', + 'Subjectivity', 'Lexical Density', 'Readability Score', + 'Average Sentence Length' + )) + assert_that(properties.iloc[0].values, contains_exactly( + 'en', 22, 3.6, 9, 0.0, 0.0, 0.0, 80.0, 100.24, 5 + )) def test_embeddings(): diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index 21099cde53..0eb1c9833f 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -18,6 +18,7 @@ import pytest from hamcrest import * +from deepchecks.nlp.text_data import TextData from deepchecks.nlp.utils.text_properties import MODELS_STORAGE, calculate_default_properties, get_transformer_model @@ -26,7 +27,7 @@ def mock_fn(*args, **kwargs): # pylint: disable=unused-argument @patch('deepchecks.nlp.utils.text_properties.run_available_kwargs', mock_fn) -def test_calculate_toxicity_property(): +def test_that_warning_is_shown_for_big_datasets(): # Arrange raw_text = ['This is a test sentence.'] * 20_000 @@ -41,7 +42,7 @@ def test_calculate_toxicity_property(): include_long_calculation_properties=True)[0] # Assert - assert_that(result, equal_to({'Toxicity': [0] * 20_000})) + assert len(result['Toxicity']) == len(raw_text) def test_calculate_lexical_density_property(tweet_emotion_train_test_textdata): @@ -72,12 +73,11 @@ def test_calculate_unique_noun_count_property(tweet_emotion_train_test_textdata) include_long_calculation_properties=True)[0] # Assert - assert_that(result['Unique Noun Count'][0: 10], equal_to([9, 2, 3, 3, 4, 10, 4, 2, 7, 5])) + assert_that(result['Unique Noun Count'][0: 10], equal_to([9, 2, 3, 3, 4, 10, np.nan, 2, 7, 5])) assert_that(result_none_text['Unique Noun Count'], equal_to([np.nan])) def test_calculate_average_sentence_length_property(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -92,7 +92,6 @@ 
def test_calculate_average_sentence_length_property(tweet_emotion_train_test_tex def test_calculate_readability_score_property(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -102,7 +101,9 @@ def test_calculate_readability_score_property(tweet_emotion_train_test_textdata) result_none_text = calculate_default_properties([None], include_properties=['Readability Score'])[0] # Assert - assert_that(result['Readability Score'][0: 10], equal_to([102.045, 97.001, 80.306, 67.755, 77.103, 71.782, 90.99, 75.5, 70.102, 95.564])) + assert_that(result['Readability Score'][0: 10], equal_to([ + 102.045, 97.001, 80.306, 67.755, 77.103, 71.782, np.nan, 75.5, 70.102, 95.564 + ])) assert_that(result_none_text['Readability Score'], equal_to([np.nan])) @@ -170,3 +171,30 @@ def test_properties_models_download_into_provided_directory(): assert MODELS_STORAGE.exists() and MODELS_STORAGE.is_dir() assert model_path.exists() and model_path.is_dir() assert onnx_model_path.exists() and onnx_model_path.is_dir() + + +def test_english_only_properties_calculation_with_not_english_samples(): + # Arrange + text = [ + 'Explicit is better than implicit', + 'Сьогодні чудова погода', + 'London is the capital of Great Britain' + ] + # Act + properties, properties_types = calculate_default_properties( + raw_text=text, + include_properties=['Sentiment', 'Language', 'Text Length'] + ) + # Assert + assert_that(properties, has_entries({ + 'Sentiment': contains_exactly(close_to(0.5, 0.01), same_instance(np.nan), close_to(0.8, 0.01)), + 'Language': contains_exactly('en', 'uk', 'en'), + 'Text Length': contains_exactly(*[len(it) for it in text]), + })) # type: ignore + assert_that(properties_types, has_entries({ + 'Sentiment': 'numeric', + 'Language': 'categorical', + 'Text Length': 'numeric', + })) # type: ignore + + From a2b9fcec004f6afbb0960536cdafa776b15bbe14 Mon Sep 17 00:00:00 2001 From: shir22 <33841818+shir22@users.noreply.github.com> Date: Mon, 15 May 2023 19:05:40 +0300 Subject: [PATCH 12/20] [DOCS] Add JMLR Paper reference to CITATION.cff (#2486) --- CITATION.cff | 102 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 27 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 94e4a8a766..1c026e060b 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,49 +1,97 @@ cff-version: 1.2.0 -title: >- - Deepchecks: A Library for Testing and Validating - Machine Learning Models and Data -message: >- - If you use this software, please cite it using the - metadata from this file. +title: "Deepchecks: A Library for Testing and Validating Machine Learning Models and Data" +message: "If you use this software, please cite it using the metadata from this file." type: software authors: - - given-names: Shir - family-names: Chorev +- family-names: Chorev + given-names: Shir + email: shir@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Tannor + given-names: Philip + email: philip@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Ben Israel + given-names: Dan + email: danb@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Bressler + given-names: Noam + email: noam@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Gabbay + given-names: Itay + email: itay@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Hutnik + given-names: Nir + email: nir@deepchecks.com + affiliation: Deepchecks Ltd. 
+- family-names: Liberman + given-names: Jonatan + email: jonatan@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Perlmutter + given-names: Matan + email: matan@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Romanyshyn + given-names: Yurii + email: yurii@deepchecks.com + affiliation: Deepchecks Ltd. +- family-names: Rokach + given-names: Lior + email: liorrk@bgu.ac.il + affiliation: Deepchecks Ltd. and Department of Software and Info. Sys. Eng. Ben-Gurion University of the Negev +url: "https://github.com/deepchecks/deepchecks" +preferred-citation: + type: article + authors: + - family-names: Chorev + given-names: Shir email: shir@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Philip - family-names: Tannor + - family-names: Tannor + given-names: Philip email: philip@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Dan - family-names: Ben Israel + - family-names: Ben Israel + given-names: Dan email: danb@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Noam - family-names: Bressler + - family-names: Bressler + given-names: Noam email: noam@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Itay - family-names: Gabbay + - family-names: Gabbay + given-names: Itay email: itay@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Nir - family-names: Hutnik + - family-names: Hutnik + given-names: Nir email: nir@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Jonatan - family-names: Liberman + - family-names: Liberman + given-names: Jonatan email: jonatan@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Matan - family-names: Perlmutter + - family-names: Perlmutter + given-names: Matan email: matan@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Yurii - family-names: Romanyshyn + - family-names: Romanyshyn + given-names: Yurii email: yurii@deepchecks.com affiliation: Deepchecks Ltd. - - given-names: Lior - family-names: Rokach - email: liorrk@bgu.ac.ail + - family-names: Rokach + given-names: Lior + email: liorrk@bgu.ac.il affiliation: Deepchecks Ltd. and Department of Software and Info. Sys. Eng. 
Ben-Gurion University of the Negev + title: "Deepchecks: A Library for Testing and Validating Machine Learning Models and Data" + journal: Journal of Machine Learning Research + year: 2022 + volume: 23 + number: 265 + start: 1 + end: 6 + url: "http://jmlr.org/papers/v23/22-0281.html" From 32fb4630f2996b4241d41585c065d5816f309817 Mon Sep 17 00:00:00 2001 From: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> Date: Tue, 16 May 2023 10:32:31 +0300 Subject: [PATCH 13/20] Extended just dance dataset + small fixes (#2538) --- .../weak_segments_performance.py | 45 +++-- deepchecks/nlp/context.py | 9 +- .../just_dance_comment_analysis.py | 169 ++++++++++++++++-- deepchecks/nlp/utils/text_embeddings.py | 2 +- deepchecks/nlp/utils/text_properties.py | 2 +- .../weak_segments_performance.py | 4 +- .../model_evaluation/prediction_drift_test.py | 4 +- .../weak_segment_performance_test.py | 14 +- .../train_test_validation/label_drift_test.py | 4 +- tests/nlp/conftest.py | 4 +- tests/nlp/test_datasets.py | 23 ++- 11 files changed, 218 insertions(+), 62 deletions(-) diff --git a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py index b02d4c01d7..2de089e03c 100644 --- a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py @@ -19,7 +19,6 @@ from deepchecks.core.check_result import DisplayMap from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError from deepchecks.nlp import Context, SingleDatasetCheck -from deepchecks.nlp.task_type import TaskType from deepchecks.nlp.utils.weak_segments import get_relevant_data_table from deepchecks.tabular.context import _DummyModel from deepchecks.utils.abstracts.weak_segment_abstract import WeakSegmentAbstract @@ -34,7 +33,7 @@ class WeakSegmentsAbstractText(SingleDatasetCheck, WeakSegmentAbstract): def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None], ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int], - segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Callable], + segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]], score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int, categorical_aggregation_threshold: float, n_to_show: int, **kwargs): super().__init__(**kwargs) @@ -63,6 +62,10 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: # Decide which scorer and score_per_sample to use in the algorithm run is_multilabel = text_data.is_multi_label_classification() if is_multilabel: + if self.alternative_scorer is None: + self.alternative_scorer = {'F1 Macro': 'f1_macro'} + + # TODO: make weak segments work with multilabel directly without reducing to single dimension # For multilabel, we reduce the label to a single dimension using TruncatedSVD, which is better in handling # dimensionality reduction of sparse matrices label = TruncatedSVD(1).fit_transform(text_data.label).squeeze() @@ -80,19 +83,15 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: avg_score = round(score_per_sample.mean(), 3) else: predictions = context.model.predict(text_data) - if context.task_type == TaskType.TEXT_CLASSIFICATION: - if not hasattr(context.model, 'predict_proba'): - raise DeepchecksNotSupportedError( - 'Predicted probabilities not supplied. 
The weak segment checks relies' - ' on cross entropy error that requires predicted probabilities, ' - 'rather than only predicted classes.') - y_proba = context.model.predict_proba(text_data) - score_per_sample = calculate_neg_cross_entropy_per_sample(text_data.label, np.asarray(y_proba), - is_multilabel=is_multilabel, - model_classes=context.model_classes) - else: - raise DeepchecksNotSupportedError('Weak segments performance check is not supported for ' - f'{context.task_type}.') + if not hasattr(context.model, 'predict_proba'): + raise DeepchecksNotSupportedError( + 'Predicted probabilities not supplied. The weak segment checks relies' + ' on cross entropy error that requires predicted probabilities, ' + 'rather than only predicted classes.') + y_proba = context.model.predict_proba(text_data) + score_per_sample = calculate_neg_cross_entropy_per_sample(text_data.label, np.asarray(y_proba), + is_multilabel=is_multilabel, + model_classes=context.model_classes) dummy_model = _DummyModel(test=encoded_dataset, y_pred_test=predictions, y_proba_test=y_proba, validate_data_on_predict=False) scorer = context.get_single_scorer(self.alternative_scorer) @@ -150,7 +149,7 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText): segment_minimum_size_ratio: float , default: 0.05 Minimum size ratio for segments. Will only search for segments of size >= segment_minimum_size_ratio * data_size. - alternative_scorer : Tuple[str, Union[str, Callable]] , default: None + alternative_scorer : Dict[str, Union[str, Callable]] , default: None Scorer to use as performance measure, either function or sklearn scorer name. If None, a default scorer (per the model type) will be used. score_per_sample: Optional[np.array, pd.Series, None], default: None @@ -158,7 +157,7 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText): a higher score mean better model performance on that sample. If provided, the check will also use provided score per sample as a scoring function for segments. if None the check calculates score per sample by via neg cross entropy for classification. - n_samples : int , default: 10_000 + n_samples : int , default: 5_000 Maximum number of samples to use for this check. n_to_show : int , default: 3 number of segments with the weakest performance to show. @@ -171,9 +170,9 @@ def __init__(self, ignore_properties: Union[Hashable, List[Hashable], None] = None, n_top_properties: Optional[int] = 15, segment_minimum_size_ratio: float = 0.05, - alternative_scorer: Dict[str, Callable] = None, + alternative_scorer: Dict[str, Union[str, Callable]] = None, score_per_sample: Union[np.ndarray, pd.Series, None] = None, - n_samples: int = 10_000, + n_samples: int = 5_000, categorical_aggregation_threshold: float = 0.05, n_to_show: int = 3, **kwargs): @@ -216,7 +215,7 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText): segment_minimum_size_ratio: float , default: 0.05 Minimum size ratio for segments. Will only search for segments of size >= segment_minimum_size_ratio * data_size. - alternative_scorer : Tuple[str, Union[str, Callable]] , default: None + alternative_scorer : Dict[str, Union[str, Callable]] , default: None Scorer to use as performance measure, either function or sklearn scorer name. If None, a default scorer (per the model type) will be used. score_per_sample: Union[np.array, pd.Series, None], default: None @@ -224,7 +223,7 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText): a higher score mean better model performance on that sample. 
If provided, the check will also use provided score per sample as a scoring function for segments. if None the check calculates score per sample by via neg cross entropy for classification. - n_samples : int , default: 10_000 + n_samples : int , default: 5_000 Maximum number of samples to use for this check. n_to_show : int , default: 3 number of segments with the weakest performance to show. @@ -237,9 +236,9 @@ def __init__(self, ignore_columns: Union[Hashable, List[Hashable], None] = None, n_top_columns: Optional[int] = 15, segment_minimum_size_ratio: float = 0.05, - alternative_scorer: Dict[str, Callable] = None, + alternative_scorer: Dict[str, Union[str, Callable]] = None, score_per_sample: Union[np.ndarray, pd.Series, None] = None, - n_samples: int = 10_000, + n_samples: int = 5_000, categorical_aggregation_threshold: float = 0.05, n_to_show: int = 3, **kwargs): diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 4dfb33f835..4fe330bdcf 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -449,16 +449,16 @@ def get_scorers(self, return init_validate_scorers(scorers, self.model_classes, self._observed_classes) def get_single_scorer(self, - scorer: t.Mapping[str, t.Union[str, t.Callable]] = None, + scorer: t.Union[t.Mapping[str, t.Union[str, t.Callable]], t.List[str], None] = None, use_avg_defaults=True) -> DeepcheckScorer: """Return initialized & validated scorer if provided or a default scorer otherwise. Parameters ---------- - scorer : Union[List[str], Dict[str, Union[str, Callable]]], default: None + scorer : t.Union[t.Mapping[str, t.Union[str, t.Callable]], t.List[str], None], default: None List of scorers to use. If None, use default scorers. Scorers can be supplied as a list of scorer names or as a dictionary of names and functions. - use_avg_defaults : bool, default True + use_avg_defaults : bool, default: True If no scorers were provided, for classification, determines whether to use default scorers that return an averaged metric, or default scorers that return a metric per class. Returns @@ -466,7 +466,4 @@ def get_single_scorer(self, List[DeepcheckScorer] An initialized & validated scorer. 
""" - if scorer is not None: - scorer_name = next(iter(scorer)) - scorer = {scorer_name: scorer[scorer_name]} return self.get_scorers(scorer, use_avg_defaults)[0] diff --git a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py index c918556ea9..490931e43f 100644 --- a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py +++ b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py @@ -28,6 +28,7 @@ import pathlib import typing as t +import numpy as np import pandas as pd from deepchecks.nlp import TextData @@ -35,9 +36,11 @@ __all__ = ['load_data'] - _FULL_DATA_URL = 'https://figshare.com/ndownloader/files/40564895' - +_SHORT_DATA_URL = 'https://figshare.com/ndownloader/files/40576232' +_SHORT_PROPERTIES_URL = 'https://figshare.com/ndownloader/files/40580693' +_SHORT_EMBEDDINGS_URL = 'https://figshare.com/ndownloader/files/40576328' +_SHORT_PROBAS_URL = 'https://figshare.com/ndownloader/files/40578866' ASSETS_DIR = pathlib.Path(__file__).absolute().parent.parent / 'assets' / 'just_dance_comment_analysis' @@ -45,9 +48,113 @@ _CAT_METADATA = [] _CAT_PROPERTIES = ['Language'] _TEXT_COL = 'originalText' +_TIME_COL = 'dateComment' +_DATE_TO_SPLIT_BY = '2015-01-01' + + +def load_precalculated_predictions(pred_format: str = 'predictions', as_train_test: bool = True, + use_full_size: bool = False) -> \ + t.Union[np.array, t.Tuple[np.array, np.array]]: + """Load and return a precalculated predictions for the dataset. + + Parameters + ---------- + pred_format : str, default: 'predictions' + Represent the format of the returned value. Can be 'predictions' or 'probabilities'. + 'predictions' will return the predicted class for each sample. + 'probabilities' will return the predicted probabilities for each sample. + as_train_test : bool, default: True + If True, the returned data is split into train and test exactly like the toy model + was trained. The first return value is the train data and the second is the test data. + Otherwise, returns a single object. + use_full_size : bool, default: False + If True, the returned data will be the full dataset, otherwise returns a subset of the data. + Returns + ------- + predictions : np.ndarray + The prediction of the data elements in the dataset. + + """ + if use_full_size: + raise NotImplementedError('Predictions for the full dataset are not yet available.') + all_preds = read_and_save_data(ASSETS_DIR, 'just_dance_probabilities.csv', _SHORT_PROBAS_URL, to_numpy=True, + file_type='npy') + + if pred_format == 'predictions': + all_preds = (np.array(all_preds) > 0.5) + all_preds = all_preds.astype(int) + elif pred_format != 'probabilities': + raise ValueError('pred_format must be either "predictions" or "probabilities"') + + if as_train_test: + train_indexes, test_indexes = _get_train_test_indexes() + return all_preds[train_indexes], all_preds[test_indexes] + else: + return all_preds + + +def load_embeddings(as_train_test: bool = True, use_full_size: bool = False) -> \ + t.Union[np.array, t.Tuple[np.array, np.array]]: + """Load and return the embeddings of the just dance dataset calculated by OpenAI. + + Parameters + ---------- + as_train_test : bool, default: True + If True, the returned data is split into train and test exactly like the toy model + was trained. The first return value is the train data and the second is the test data. + Otherwise, returns a single object. 
+ use_full_size : bool, default: False + If True, the returned data will be the full dataset, otherwise returns a subset of the data. + + Returns + ------- + embeddings : np.ndarray + Embeddings for the just dance dataset. + """ + if use_full_size: + raise NotImplementedError('Embeddings for the full dataset are not yet available.') + + all_embeddings = read_and_save_data(ASSETS_DIR, 'just_dance_embeddings.npy', _SHORT_EMBEDDINGS_URL, + file_type='npy', to_numpy=True) + + if as_train_test: + train_indexes, test_indexes = _get_train_test_indexes(use_full_size) + return all_embeddings[train_indexes], all_embeddings[test_indexes] + else: + return all_embeddings + +def load_properties(as_train_test: bool = True, use_full_size: bool = False) -> \ + t.Union[pd.DataFrame, t.Tuple[pd.DataFrame, pd.DataFrame]]: + """Load and return the properties of the just_dance dataset. -def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_full_size: bool = False) -> \ + Parameters + ---------- + as_train_test : bool, default: True + If True, the returned data is split into train and test exactly like the toy model + was trained. The first return value is the train data and the second is the test data. + In order to get this model, call the load_fitted_model() function. + Otherwise, returns a single object. + use_full_size : bool, default: False + If True, the returned data will be the full dataset, otherwise returns a subset of the data. + Returns + ------- + properties : pd.DataFrame + Properties for the just dance dataset. + """ + if use_full_size: + raise NotImplementedError('Properties for the full dataset are not yet available.') + properties = read_and_save_data(ASSETS_DIR, 'just_dance_properties.csv', _SHORT_PROPERTIES_URL, to_numpy=False) + + if as_train_test: + train_indexes, test_indexes = _get_train_test_indexes(use_full_size) + return properties.loc[train_indexes], properties.loc[test_indexes] + else: + return properties + + +def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_full_size: bool = False, + include_properties: bool = True, include_embeddings: bool = False) -> \ t.Union[t.Tuple, t.Union[TextData, pd.DataFrame]]: """Load and returns the Just Dance Comment Analysis dataset (multi-label classification). @@ -64,6 +171,10 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_ful Otherwise, returns a single object. use_full_size : bool, default: False If True, the returned data will be the full dataset, otherwise returns a subset of the data. + include_properties : bool, default: True + If True, the returned data will include properties of the comments. Incompatible with data_format='DataFrame' + include_embeddings : bool, default: True + If True, the returned data will include embeddings of the comments. 
Incompatible with data_format='DataFrame' Returns ------- @@ -75,27 +186,28 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_ful if data_format.lower() not in ['textdata', 'dataframe']: raise ValueError('data_format must be either "Dataset" or "Dataframe"') - data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False) - data['dateComment'] = pd.to_datetime(data['dateComment']) + if use_full_size: + data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False) + else: + data = read_and_save_data(ASSETS_DIR, 'just_dance_shorted_data.csv', _SHORT_DATA_URL, to_numpy=False) + data[_TIME_COL] = pd.to_datetime(data[_TIME_COL]) + + properties = load_properties(as_train_test=False, use_full_size=use_full_size) if include_properties else None + embeddings = load_embeddings(as_train_test=False, use_full_size=use_full_size) if include_embeddings else None if not as_train_test: - if not use_full_size: - data = data[(data['dateComment'] < '2013-01-01') | (data['dateComment'] >= '2021-01-01')] if data_format.lower() != 'textdata': return data label = data.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int) dataset = TextData(data[_TEXT_COL], label=label, task_type='text_classification', - metadata=data[_METADATA_COLS], categorical_metadata=_CAT_METADATA) + metadata=data[_METADATA_COLS], categorical_metadata=_CAT_METADATA, + properties=properties, categorical_properties=_CAT_PROPERTIES, embeddings=embeddings) return dataset else: - if use_full_size: - train = data[data['dateComment'] < '2015-01-01'] - test = data[data['dateComment'] >= '2015-01-01'] - else: - train = data[data['dateComment'] < '2013-01-01'] - test = data[data['dateComment'] >= '2021-01-01'] + train_indexes, test_indexes = _get_train_test_indexes(use_full_size) + train, test = data.loc[train_indexes], data.loc[test_indexes] if data_format.lower() != 'textdata': return train, test @@ -104,9 +216,34 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_ful label_train = train.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int) label_test = test.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int) + if include_properties: + train_properties, test_properties = properties.loc[train.index], properties.loc[test.index] + else: + train_properties, test_properties = None, None + if include_embeddings: + train_embeddings = embeddings[train.index] # pylint: disable=unsubscriptable-object + test_embeddings = embeddings[test.index] # pylint: disable=unsubscriptable-object + else: + train_embeddings, test_embeddings = None, None + train_ds = TextData(train[_TEXT_COL], label=label_train, task_type='text_classification', - metadata=train_metadata, categorical_metadata=_CAT_METADATA) + metadata=train_metadata, categorical_metadata=_CAT_METADATA, + properties=train_properties, categorical_properties=_CAT_PROPERTIES, + embeddings=train_embeddings) test_ds = TextData(test[_TEXT_COL], label=label_test, task_type='text_classification', - metadata=test_metadata, categorical_metadata=_CAT_METADATA) + metadata=test_metadata, categorical_metadata=_CAT_METADATA, + properties=test_properties, categorical_properties=_CAT_PROPERTIES, + embeddings=test_embeddings) return train_ds, test_ds + + +def _get_train_test_indexes(use_full_size: bool = False) -> t.Tuple[np.array, np.array]: + """Get the indexes of the train and test sets.""" + if use_full_size: + dataset = pd.read_csv(ASSETS_DIR / 'just_dance_data.csv', 
usecols=[_TIME_COL]) + else: + dataset = pd.read_csv(ASSETS_DIR / 'just_dance_shorted_data.csv', usecols=[_TIME_COL]) + train_indexes = dataset[dataset[_TIME_COL] < _DATE_TO_SPLIT_BY].index + test_indexes = dataset[dataset[_TIME_COL] >= _DATE_TO_SPLIT_BY].index + return train_indexes, test_indexes diff --git a/deepchecks/nlp/utils/text_embeddings.py b/deepchecks/nlp/utils/text_embeddings.py index 14d9599010..de88aca5dc 100644 --- a/deepchecks/nlp/utils/text_embeddings.py +++ b/deepchecks/nlp/utils/text_embeddings.py @@ -70,7 +70,7 @@ def _get_embedding_with_backoff(list_of_strings): embeddings.append(x['embedding']) else: raise ValueError(f'Unknown model type: {model}') - embeddings = np.array(embeddings) + embeddings = np.array(embeddings).astype(np.float16) if file_path is not None: np.save(file_path, embeddings) return embeddings diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 912550d4be..89ebaac8f2 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -282,7 +282,7 @@ def _predict(text, classifier, kind): return v['score'] elif kind == 'fluency': label_value = 'LABEL_1' - elif kind == 'fluency': + elif kind == 'formality': label_value = 'formal' else: raise ValueError('Unssuported value for "kind" parameter') diff --git a/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py b/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py index 8faf8ee062..e8ba634848 100644 --- a/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py @@ -54,7 +54,7 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract): segment_minimum_size_ratio: float , default: 0.05 Minimum size ratio for segments. Will only search for segments of size >= segment_minimum_size_ratio * data_size. - alternative_scorer : Tuple[str, Union[str, Callable]] , default: None + alternative_scorer : Dict[str, Union[str, Callable]] , default: None Scorer to use as performance measure, either function or sklearn scorer name. If None, a default scorer (per the model type) will be used. 
score_per_sample: Union[np.array, pd.Series, None], default: None @@ -81,7 +81,7 @@ def __init__( ignore_columns: Union[Hashable, List[Hashable], None] = None, n_top_features: int = 5, segment_minimum_size_ratio: float = 0.05, - alternative_scorer: Dict[str, Callable] = None, + alternative_scorer: Dict[str, Union[str, Callable]] = None, loss_per_sample: Union[np.ndarray, pd.Series, None] = None, score_per_sample: Union[np.ndarray, pd.Series, None] = None, n_samples: int = 10_000, diff --git a/tests/nlp/checks/model_evaluation/prediction_drift_test.py b/tests/nlp/checks/model_evaluation/prediction_drift_test.py index 19bb229fc6..162fe49313 100644 --- a/tests/nlp/checks/model_evaluation/prediction_drift_test.py +++ b/tests/nlp/checks/model_evaluation/prediction_drift_test.py @@ -91,11 +91,11 @@ def test_just_dance_small_drift(just_dance_train_test_textdata_sampled): # Assert assert_that(condition_result, has_items( equal_condition_result(is_pass=True, - details="Found model prediction Cramer's V drift score of 0.05", + details="Found model prediction Cramer's V drift score of 0.07", name='Prediction drift score < 0.1') )) - assert_that(result.value['Drift score'], close_to(0.05, 0.01)) + assert_that(result.value['Drift score'], close_to(0.07, 0.01)) def test_token_classification(small_wikiann_train_test_text_data): diff --git a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py index 1a6745e4ca..5bcd525cfa 100644 --- a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py +++ b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py @@ -92,13 +92,13 @@ def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): # Assert assert_that(condition_result, has_items( - equal_condition_result(is_pass=False, - details='Found a segment with accuracy score of 0.395 in comparison to an average ' - 'score of 0.624 in sampled data.', - name='The relative performance of weakest segment is greater than 80% of average model ' - 'performance.') + equal_condition_result(is_pass=True, + details='Found a segment with f1 macro score of 0.695 in comparison to an average ' + 'score of 0.83 in sampled data.', + name='The relative performance of weakest segment is greater ' + 'than 80% of average model performance.') )) - assert_that(result.value['avg_score'], close_to(0.624, 0.001)) + assert_that(result.value['avg_score'], close_to(0.83, 0.001)) assert_that(len(result.value['weak_segments_list']), is_in([5, 6])) # TODO: check why it's not always 5 - assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.395, 0.01)) + assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.695, 0.01)) diff --git a/tests/nlp/checks/train_test_validation/label_drift_test.py b/tests/nlp/checks/train_test_validation/label_drift_test.py index c31d39941d..82af906f33 100644 --- a/tests/nlp/checks/train_test_validation/label_drift_test.py +++ b/tests/nlp/checks/train_test_validation/label_drift_test.py @@ -27,11 +27,11 @@ def test_just_dance_small_drift(just_dance_train_test_textdata_sampled): # Assert assert_that(condition_result, has_items( equal_condition_result(is_pass=True, - details="Label's drift score Cramer's V is 0.05", + details="Label's drift score Cramer's V is 0.07", name='Label drift score < 0.1') )) - assert_that(result.value['Drift score'], close_to(0.05, 0.01)) + assert_that(result.value['Drift score'], close_to(0.07, 0.01)) def 
test_tweet_emotion(tweet_emotion_train_test_textdata): diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index 27172aa3b2..64985c27f2 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -38,10 +38,12 @@ def tweet_emotion_train_test_textdata(): include_embeddings=True) return train, test + @pytest.fixture(scope='session') def just_dance_train_test_textdata_sampled(): """Just Dance text multilabel classification dataset""" - train, test = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True) + train, test = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True, + include_embeddings=True) sampled_train = train.sample(500, random_state=42) sampled_test = test.sample(500, random_state=42) return sampled_train, sampled_test diff --git a/tests/nlp/test_datasets.py b/tests/nlp/test_datasets.py index 05b96c3612..ee44afa526 100644 --- a/tests/nlp/test_datasets.py +++ b/tests/nlp/test_datasets.py @@ -53,7 +53,15 @@ def test_just_dance_comment_analysis(): # Arrange train, test = just_dance_comment_analysis.load_data(data_format='Dataframe', as_train_test=True) full = just_dance_comment_analysis.load_data(data_format='Dataframe', as_train_test=False) - full_ds = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=False) + full_ds = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=False, + include_embeddings=True) + preds = just_dance_comment_analysis.load_precalculated_predictions(pred_format='predictions', as_train_test=False) + probas = just_dance_comment_analysis.load_precalculated_predictions(pred_format='probabilities', + as_train_test=False) + properties = just_dance_comment_analysis.load_properties(as_train_test=False) + train_props, test_props = just_dance_comment_analysis.load_properties(as_train_test=True) + embeddings = just_dance_comment_analysis.load_embeddings(as_train_test=False) + train_embeddings, test_embeddings = just_dance_comment_analysis.load_embeddings(as_train_test=True) # Act & Assert assert_that(len(train) + len(test), equal_to(len(full))) @@ -61,3 +69,16 @@ def test_just_dance_comment_analysis(): assert_that(train.columns, contains_exactly(*full.columns)) assert_that(len(full_ds.text), equal_to(len(full))) + assert_that(len(full_ds.text), equal_to(len(preds))) + assert_that(len(full_ds.text), equal_to(len(probas))) + + assert_that(len(properties), equal_to(len(full))) + assert_that(len(train_props) + len(test_props), equal_to(len(full))) + assert_that(len(train_props), equal_to(len(train))) + + assert_that(len(embeddings), equal_to(len(full))) + assert_that(len(train_embeddings) + len(test_embeddings), equal_to(len(full))) + assert_that(len(train_embeddings), equal_to(len(train))) + assert_that(embeddings.shape, contains_exactly(16281, 1536)) + assert_that(train_embeddings.shape, contains_exactly(7669, 1536)) + assert_that(test_embeddings.shape, contains_exactly(8612, 1536)) From cdab6030d952fee786ef1952290c4b67a5db6e49 Mon Sep 17 00:00:00 2001 From: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> Date: Tue, 16 May 2023 19:29:45 +0300 Subject: [PATCH 14/20] Remove langdetect from requirements * Remove langdetect from requirements as we now use fasttext --------- Co-authored-by: Noam Bressler --- requirements/dev-requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/dev-requirements.txt b/requirements/dev-requirements.txt index 16aeefba88..53c84b79b4 100644 --- a/requirements/dev-requirements.txt +++ 
b/requirements/dev-requirements.txt @@ -47,7 +47,6 @@ beautifulsoup4>=4.11.1 # NLP nltk<=3.6.7 datasets -langdetect textblob transformers sentence-transformers \ No newline at end of file From aaafe511ecee5ebd82dda1393ecad31ffded94ef Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Wed, 17 May 2023 23:17:10 +0300 Subject: [PATCH 15/20] Noam/dee 593 plot example for integrity checks (#2542) * Add unknown tokens check plot * add text duplicate check plot * add conflicting labels check plot * add train test sample mix check plot --- README.md | 4 +- .../data_integrity/conflicting_labels.py | 4 +- .../data_integrity/plot_conflicting_labels.py | 88 ++++++++++++++ .../data_integrity/plot_text_duplicates.py | 90 ++++++++++++++ .../nlp/data_integrity/plot_unknown_tokens.py | 114 ++++++++++++++++++ .../plot_train_test_sample_mix.py | 97 +++++++++++++++ 6 files changed, 393 insertions(+), 4 deletions(-) create mode 100644 docs/source/checks/nlp/data_integrity/plot_conflicting_labels.py create mode 100644 docs/source/checks/nlp/data_integrity/plot_text_duplicates.py create mode 100644 docs/source/checks/nlp/data_integrity/plot_unknown_tokens.py create mode 100644 docs/source/checks/nlp/train_test_validation/plot_train_test_sample_mix.py diff --git a/README.md b/README.md index fd8bf5b1d6..f6f60efe7e 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ pip install deepchecks -U --user > ``` > > To install deepchecks together with the **NLP Submodule** that -> is currently in *alpha release*, replace +> is currently in *beta release*, replace > ``deepchecks`` with ``"deepchecks[nlp]"`` as follows: > ```bash > pip install "deepchecks[nlp]" -U --user @@ -303,7 +303,7 @@ subset of the following: The package currently supports tabular data and is in: - *beta release* for the [Computer Vision subpackage](deepchecks/vision). -- *alpha release* for the [NLP subpackage](deepchecks/nlp). +- *beta release* for the [NLP subpackage](deepchecks/nlp). 
## 📖 Documentation diff --git a/deepchecks/nlp/checks/data_integrity/conflicting_labels.py b/deepchecks/nlp/checks/data_integrity/conflicting_labels.py index 32ef5e257f..7d66912a6f 100644 --- a/deepchecks/nlp/checks/data_integrity/conflicting_labels.py +++ b/deepchecks/nlp/checks/data_integrity/conflicting_labels.py @@ -121,7 +121,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: ambiguous_samples_hashes = n_of_labels_per_sample[n_of_labels_per_sample > 1] ambiguous_samples_hashes = frozenset(ambiguous_samples_hashes.index.to_list()) - ambiguous_samples = df[df['hash'].isin(ambiguous_samples_hashes)] + ambiguous_samples = df[df['hash'].isin(ambiguous_samples_hashes)].copy() num_of_ambiguous_samples = ambiguous_samples['Text'].count() percent_of_ambiguous_samples = num_of_ambiguous_samples / n_of_samples @@ -138,7 +138,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: if context.with_display is False or num_of_ambiguous_samples == 0: return CheckResult(value=result_value) - ambiguous_samples['Text'] = ambiguous_samples['Text'].apply(self._truncate_text) + ambiguous_samples.loc[:, 'Text'] = ambiguous_samples['Text'].apply(self._truncate_text) by_hash = ambiguous_samples.groupby(['hash'], dropna=False) observed_labels = by_hash['Label'].aggregate(lambda x: format_list(x.to_list())) samples_ids = by_hash['Sample ID'].aggregate(lambda x: format_list(x.to_list(), max_string_length=200)) diff --git a/docs/source/checks/nlp/data_integrity/plot_conflicting_labels.py b/docs/source/checks/nlp/data_integrity/plot_conflicting_labels.py new file mode 100644 index 0000000000..6fe1c3306f --- /dev/null +++ b/docs/source/checks/nlp/data_integrity/plot_conflicting_labels.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +""" +.. _nlp__conflicting_labels: + +Conflicting Labels +****************** + +This notebook provides an overview for using and understanding the Conflicting Labels check: + +**Structure:** + +* `Why check for conflicting labels? <#why-check-for-conflicting-labels>`__ +* `Create TextData <#create-textdata>`__ +* `Run the Check <#run-the-check>`__ +* `Define a Condition <#define-a-condition>`__ + +Why check for conflicting labels? +================================== + +The ``ConflictingLabels`` check finds identical or nearly identical (see +`text normalization <#with-text-normalization>`__) samples in the dataset that have different labels. Conflicting labels +can lead to inconsistencies and confusion for the model during training. Identifying such samples can help in cleaning +the data and improving the model's performance. + +Create TextData +=============== + +Lets create a simple dataset with some samples having conflicting labels. 
+""" + +from deepchecks.nlp import TextData +from deepchecks.nlp.checks import ConflictingLabels + +texts = [ + "Deep learning is a subset of machine learning.", + "Deep learning is a subset of machine learning.", + "Deep learning is a sub-set of Machine Learning.", + "Deep learning is subset of machine learning", + "Natural language processing is a subfield of AI.", + "This is a unique text sample.", + "This is another unique text.", +] + +labels = [0, 1, 1, 0, 2, 2, 2] + +dataset = TextData(texts, label=labels, task_type='text_classification') + +#%% +# Run the Check +# ============= + +# Run the check without any text normalization +ConflictingLabels( + ignore_case=False, + remove_punctuation=False, + normalize_unicode=False, + remove_stopwords=False, + ignore_whitespace=False +).run(dataset) + +# %% +# With Text Normalization +# ----------------------- +# By default, ``ConflictingLabels`` check applies text normalization before identifying the conflicting labels. +# This includes case normalization, punctuation removal, Unicode normalization and stopwords removal. +# You can also customize the normalization as per your requirements: + +ConflictingLabels( + ignore_case=True, + remove_punctuation=True, + normalize_unicode=True, + remove_stopwords=True, + ignore_whitespace=True +).run(dataset) + +# %% +# Of all the parameters in this example, ``ignore_whitespace`` is the only one set to ``False`` by default. +# +# Define a Condition +# ================== +# +# Now, we define a condition that enforces the ratio of samples with conflicting labels to be 0. A condition +# is deepchecks' way to validate model and data quality, and let you know if anything goes wrong. + +check = ConflictingLabels() +check.add_condition_ratio_of_conflicting_labels_less_or_equal(0) +result = check.run(dataset) +result.show(show_additional_outputs=False) diff --git a/docs/source/checks/nlp/data_integrity/plot_text_duplicates.py b/docs/source/checks/nlp/data_integrity/plot_text_duplicates.py new file mode 100644 index 0000000000..4fe565fe78 --- /dev/null +++ b/docs/source/checks/nlp/data_integrity/plot_text_duplicates.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +""" +.. _nlp__data_duplicates: + +Text Data Duplicates +******************** + +This notebook provides an overview for using and understanding the text data duplicates check: + +**Structure:** + +* `Why check for text data duplicates? <#why-check-for-text-data-duplicates>`__ +* `Create TextData <#create-textdata>`__ +* `Run the Check <#run-the-check>`__ +* `Define a Condition <#define-a-condition>`__ + +Why check for text data duplicates? +=================================== + +The ``TextDuplicates`` check finds multiple instances of identical or nearly identical (see +`text normalization <#with-text-normalization>`__) samples in the +Dataset. Duplicate samples increase the weight the model gives to those samples. +If these duplicates are there intentionally (e.g. as a result of intentional +oversampling, or due to the dataset's nature it has identical-looking samples) +this may be valid, however if this is a hidden issue we're not expecting to occur, +it may be an indicator for a problem in the data pipeline that requires attention. + +Create TextData +=============== + +Let's create a simple dataset with some duplicate and similar text samples. 
+""" + +from deepchecks.nlp.checks import TextDuplicates +from deepchecks.nlp import TextData + +texts = [ + "Deep learning is a subset of machine learning.", + "Deep learning is a subset of machine learning.", + "Deep learning is a sub-set of Machine Learning.", + "Deep learning is subset of machine learning", + "Natural language processing is a subfield of AI.", + "This is a unique text sample.", + "This is another unique text.", +] + +dataset = TextData(texts) + +#%% +# Run the Check +# ============= + +# Run the check without any text normalization +TextDuplicates( + ignore_case=False, + remove_punctuation=False, + normalize_unicode=False, + remove_stopwords=False, + ignore_whitespace=False +).run(dataset) + +# %% +# With Text Normalization +# ----------------------- +# By default, ``TextDuplicates`` check applies text normalization before identifying the duplicates. +# This includes case normalization, punctuation removal, Unicode normalization and stopwords removal. +# You can also customize the normalization as per your requirements: + +TextDuplicates( + ignore_case=True, + remove_punctuation=True, + normalize_unicode=True, + remove_stopwords=True, + ignore_whitespace=True +).run(dataset) + +# %% +# Of all the parameters in this example, ``ignore_whitespace`` is the only one set to ``False`` by default. +# +# Define a Condition +# ================== +# +# Now, we define a condition that enforces the ratio of duplicates to be 0. A condition +# is deepchecks' way to validate model and data quality, and let you know if anything +# goes wrong. + +check = TextDuplicates() +check.add_condition_ratio_less_or_equal(0) +result = check.run(dataset) +result.show(show_additional_outputs=False) diff --git a/docs/source/checks/nlp/data_integrity/plot_unknown_tokens.py b/docs/source/checks/nlp/data_integrity/plot_unknown_tokens.py new file mode 100644 index 0000000000..bae2879a5b --- /dev/null +++ b/docs/source/checks/nlp/data_integrity/plot_unknown_tokens.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +""" + +.. _nlp__unknown_tokens: + +Unknown Tokens +************** + +This notebook provides an overview for using and understanding the Unknown Tokens check. + +**Structure:** + +* `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__ +* `Generate data & model <#generate-data-model>`__ +* `Run the check <#run-the-check>`__ +* `Using the Check Value <#using-the-check-value>`__ +* `Define a condition <#define-a-condition>`__ + +What is the purpose of the check? +================================== + +The Unknown Tokens check is designed to help you identify samples that contain tokens not supported by your tokenizer. +These not supported tokens can lead to poor model performance, as the model may not be able to understand the meaning +of such tokens. By identifying these unknown tokens, you can take appropriate action, such as updating your tokenizer +or preprocessing your data to handle them. + +Generate data & model +===================== + +In this example, we'll use the twitter dataset. + +""" + +from deepchecks.nlp.datasets.classification import tweet_emotion + +dataset, _ = tweet_emotion.load_data() + +# %% +# Run the check +# ============= +# +# The check has several key parameters that affect its behavior and output: +# +# * `tokenizer`: Tokenizer from the HuggingFace transformers library to use for tokenization. If None, +# BertTokenizer.from_pretrained('bert-base-uncased') will be used. 
+# * `group_singleton_words`: If True, group all words that appear only once in the data into the "Other" category in +# the display. + + +from deepchecks.nlp.checks import UnknownTokens + +check = UnknownTokens() +result = check.run(dataset) +result.show() + +# %% +# Observe the check's output +# -------------------------- +# +# We see in the results that the check found many emojis and some foreign words (Korean, can be seen by hovering +# over the "Other Unknown Words" slice of the pie chart) that are not supported by the +# tokenizer. We can also see that the check grouped all words that appear only once in the data into the "Other" +# +# Use a Different Tokenizer +# ------------------------- +# +# We can also use a different tokenizer, such as the GPT2 tokenizer, to see how the results change. + +from transformers import GPT2Tokenizer +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + +UnknownTokens(tokenizer=tokenizer).run(dataset) + +# %% +# Using the Check Value +# ===================== +# +# On top of observing the check's display, we can use the check's returned value to get more information about the +# words containing unknown tokens in our dataset. The check's value is a nested dictionary with the following keys: +# +# 1. ``unknown_word_ratio``: The ratio of unknown words out of all words in the dataset. +# 2. ``unknown_word_details``: This is in turn also a dict, containing a key for each unknown word. The value for each +# key is a dict containing 'ratio' (the ratio of the unknown word out of all words in the dataset) and 'indexes' +# (the indexes of the samples containing the unknown word). +# +# We'll show here how you can use this value to get the individual samples containing unknown tokens. + +from pprint import pprint + +unknown_word_details = result.value['unknown_word_details'] +first_unknown_word = list(unknown_word_details.keys())[0] +print(f"Unknown word: {first_unknown_word}") + +word_indexes = unknown_word_details[first_unknown_word]['indexes'] +pprint(dataset.text[word_indexes].tolist()) + +# %% +# +# As we can see, the GPT2 tokenizer supports emojis, so the check did not find any unknown tokens. +# +# Define a condition +# ================== +# +# We can add a condition that validates the ratio of unknown words in the dataset is below a certain threshold. This can +# be useful to ensure that your dataset does not have a high percentage of unknown tokens, which might negatively impact +# the performance of your model. + +check.add_condition_ratio_of_unknown_words_less_or_equal(0.005) +result = check.run(dataset) +result.show(show_additional_outputs=False) + +# %% +# In this example, the condition checks if the ratio of unknown words is less than or equal to 0.005 (0.5%). If the +# ratio is higher than the threshold, the condition will fail, indicating a potential issue with the dataset. diff --git a/docs/source/checks/nlp/train_test_validation/plot_train_test_sample_mix.py b/docs/source/checks/nlp/train_test_validation/plot_train_test_sample_mix.py new file mode 100644 index 0000000000..d963d5b6e3 --- /dev/null +++ b/docs/source/checks/nlp/train_test_validation/plot_train_test_sample_mix.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +""" +.. _nlp__train_test_samples_mix: + +Train-Test Samples Mix +************************ + +This notebook provides an overview for using and understanding the train-test samples mix check: + +**Structure:** + +* `Why check for train-test samples mix? 
<#why-check-for-train-test-samples-mix>`__ +* `Create TextData for Train and Test Sets <#create-textdata-for-train-and-test-sets>`__ +* `Run the Check <#run-the-check>`__ +* `Define a Condition <#define-a-condition>`__ + +Why check for train-test samples mix? +====================================== +The ``TrainTestSamplesMix`` check finds instances of identical or nearly identical (see +`text normalization <#with-text-normalization>`__) samples in both the +train and test datasets. If such samples are present unintentionally, it may lead to data leakage, which +can result in overly optimistic model performance estimates during evaluation. Identifying and addressing +such issues is crucial to ensure the model performs well on unseen data. + +Create TextData for Train and Test Sets +======================================== + +Let's create train and test datasets with some overlapping and similar text samples. +""" + +from deepchecks.nlp.checks import TrainTestSamplesMix +from deepchecks.nlp import TextData + +train_texts = [ + "Deep learning is a subset of machine learning.", + "Deep learning is a subset of machine learning.", + "Deep learning is a sub-set of Machine Learning.", + "Natural language processing is a subfield of AI.",] + +test_texts = [ + "Deep learning is a subset of machine learning.", + "Deep learning is subset of machine learning", + "Machine learning is a subfield of AI.", + "This is a unique text sample in the test set.", + "This is another unique text in the test set.", +] + +train_dataset = TextData(train_texts) +test_dataset = TextData(test_texts) + +#%% +# Run the Check +# ============= + +# Run the check without any text normalization +check = TrainTestSamplesMix( + ignore_case=False, + remove_punctuation=False, + normalize_unicode=False, + remove_stopwords=False, + ignore_whitespace=False +) +result = check.run(train_dataset, test_dataset) +result.show() + +# %% +# With Text Normalization +# ----------------------- +# +# By default, ``TrainTestSamplesMix`` check applies text normalization before identifying the duplicates. +# This includes case normalization, punctuation removal, Unicode normalization and stopwords removal. +# You can also customize the normalization as per your requirements: + +check = TrainTestSamplesMix( + ignore_case=True, + remove_punctuation=True, + normalize_unicode=True, + remove_stopwords=True, + ignore_whitespace=True +) +result = check.run(train_dataset, test_dataset) +result.show() + +# %% +# Of all the parameters in this example, ``ignore_whitespace`` is the only one set to ``False`` by default. +# +# Define a Condition +# ================== +# +# Now, we define a condition that enforces the ratio of duplicates to be 0. A condition +# is deepchecks' way to validate model and data quality, and let you know if anything +# goes wrong. 
+ +check = TrainTestSamplesMix() +check.add_condition_duplicates_ratio_less_or_equal(0) +result = check.run(train_dataset, test_dataset) +result.show(show_additional_outputs=False) From 1baf39f1529384badc0d896710dc731ce4e3d463 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Thu, 18 May 2023 08:47:32 +0300 Subject: [PATCH 16/20] Noam/dee 530 feat improved quickstart for nlp (#2540) * update classification tweeter quickstart --------- Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com> --- .../datasets/classification/tweet_emotion.py | 23 +- deepchecks/nlp/utils/nlp_plot.py | 3 +- .../model_evaluation/plot_prediction_drift.py | 2 +- .../plot_create_a_custom_suite.py | 10 +- .../quickstarts/plot_text_classification.py | 391 +++++++++++------- 5 files changed, 277 insertions(+), 152 deletions(-) diff --git a/deepchecks/nlp/datasets/classification/tweet_emotion.py b/deepchecks/nlp/datasets/classification/tweet_emotion.py index e7f3e23d79..5cba9ae6eb 100644 --- a/deepchecks/nlp/datasets/classification/tweet_emotion.py +++ b/deepchecks/nlp/datasets/classification/tweet_emotion.py @@ -26,7 +26,7 @@ from deepchecks.nlp import TextData from deepchecks.utils.builtin_datasets_utils import read_and_save_data -__all__ = ['load_data', 'load_embeddings', 'load_precalculated_predictions'] +__all__ = ['load_data', 'load_embeddings', 'load_precalculated_predictions', 'load_under_annotated_data'] _FULL_DATA_URL = 'https://ndownloader.figshare.com/files/39486889' _EMBEDDINGS_URL = 'https://ndownloader.figshare.com/files/40564880' @@ -193,6 +193,27 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te return all_preds +def load_under_annotated_data(): + """Load and return the test data, modified to have under annotated segment.""" + _, test = load_data() + test_copy = test.copy() + + # randomly remove 5% of the labels + np.random.seed(42) + idx_to_fillna = np.random.choice(range(len(test)), int(len(test) * 0.05), replace=False) + test_copy._label = test_copy._label.astype(dtype=object) # pylint: disable=protected-access + test_copy._label[idx_to_fillna] = None # pylint: disable=protected-access + + # randomly remove 40% of the under annotated segments + np.random.seed(42) + under_annotated_segment_idx = test_copy.properties[ + (test_copy.properties.Fluency < 0.4) & (test_copy.properties.Formality < 0.2)].index + idx_to_fillna = np.random.choice(under_annotated_segment_idx, int(len(under_annotated_segment_idx) * 0.4), + replace=False) + test_copy._label[idx_to_fillna] = None # pylint: disable=protected-access + return test_copy + + def _get_train_test_indexes() -> t.Tuple[np.array, np.array]: """Get the indexes of the train and test sets.""" if (ASSETS_DIR / 'tweet_emotion_data.csv').exists(): diff --git a/deepchecks/nlp/utils/nlp_plot.py b/deepchecks/nlp/utils/nlp_plot.py index 271b506e4f..6fe3f6d300 100644 --- a/deepchecks/nlp/utils/nlp_plot.py +++ b/deepchecks/nlp/utils/nlp_plot.py @@ -268,7 +268,6 @@ def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_da plot_data['Sample'] = plot_data['Sample'].apply(break_to_lines_and_trim) fig = px.scatter(plot_data, x=axes[0], y=axes[1], color='Dataset', color_discrete_map=colors, - hover_data=['Label', 'Sample'], hover_name='Dataset', title=plot_title, height=600, width=1000, - opacity=0.4) + hover_data=['Label', 'Sample'], hover_name='Dataset', title=plot_title, opacity=0.4) fig.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')), selector=dict(mode='markers')) return fig 
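The `load_under_annotated_data()` helper added to `tweet_emotion.py` above returns a copy of the test split with roughly 5% of labels dropped at random, plus a denser gap inside a low-Fluency / low-Formality segment. A minimal sketch of inspecting that gap, mirroring only the attribute access the helper itself performs (`label`, `properties`) and reusing its hard-coded thresholds; the variable names are illustrative, not part of the patch:

```python
import numpy as np

from deepchecks.nlp.datasets.classification import tweet_emotion

# Copy of the tweet-emotion test split with ~5% of labels removed, plus a
# heavier removal inside the low-Fluency / low-Formality segment (see the
# load_under_annotated_data helper added in the diff above).
under_annotated = tweet_emotion.load_under_annotated_data()

# Missing labels are stored as None on the returned TextData object.
labels = np.asarray(under_annotated.label, dtype=object)
missing = np.array([lbl is None for lbl in labels])
print(f'Missing labels overall: {missing.mean():.1%}')

# Same property thresholds the helper uses to pick the under-annotated segment.
props = under_annotated.properties
segment = ((props['Fluency'] < 0.4) & (props['Formality'] < 0.2)).to_numpy()
print(f'Missing labels inside the segment: {missing[segment].mean():.1%}')
```

Presumably this fixture is meant to feed an under-annotated-segments style check in the reworked quickstart; the sketch above only verifies where the missing annotations were injected.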
diff --git a/docs/source/checks/vision/model_evaluation/plot_prediction_drift.py b/docs/source/checks/vision/model_evaluation/plot_prediction_drift.py index 4ce6bea72e..0287d0b412 100644 --- a/docs/source/checks/vision/model_evaluation/plot_prediction_drift.py +++ b/docs/source/checks/vision/model_evaluation/plot_prediction_drift.py @@ -40,7 +40,7 @@ In computer vision specifically, our predictions may be complex, and measuring their drift is not a straightforward task. Therefore, we calculate drift on different -:ref:`properties of the prediction`, +:ref:`properties of the prediction `, on which we can directly measure drift. Which Prediction Properties Are Used? diff --git a/docs/source/general/usage/customizations/plot_create_a_custom_suite.py b/docs/source/general/usage/customizations/plot_create_a_custom_suite.py index 92b328ac06..9962f5ae98 100644 --- a/docs/source/general/usage/customizations/plot_create_a_custom_suite.py +++ b/docs/source/general/usage/customizations/plot_create_a_custom_suite.py @@ -26,8 +26,8 @@ * :doc:`API Reference ` * :ref:`Tabular checks ` -* :ref:`Vision checks -* :ref:`NLP checks +* :ref:`Vision checks ` +* :ref:`NLP checks ` * Built-in suites (by printing them to see which checks they include) """ @@ -67,9 +67,9 @@ # to use the auto-complete to see the arguments a check receive or the built-in conditions # it has, try doing it outside of the suite's initialization.* # -# *For example, to see a check's built-in conditions, type in a new cell: -# ``NameOfDesiredCheck().add_condition_`` and then check the auto-complete suggestions -# (using Shift + Tab), to discover the built-in checks.* +# * For example, to see a check's built-in conditions, type in a new cell: +# ``NameOfDesiredCheck().add_condition_`` and then check the auto-complete suggestions +# (using Shift + Tab), to discover the built-in checks.* # # Additional Notes about Conditions in a Suite # -------------------------------------------- diff --git a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py index e8d57b7f5d..958d37a8b8 100644 --- a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py +++ b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py @@ -3,12 +3,23 @@ Test NLP Classification Tasks - Quickstart ****************************************** -In order to run deepchecks for NLP all you need to have are the following for both your train and test data: +In this quickstart guide, we will go over using the deepchecks NLP package to analyze and evaluate text +classification tasks. We will cover the following steps: -1. Your text data - a list of strings, each string is a single sample (can be a sentence, paragraph, document etc.). +1. `Creating a TextData object and auto calculating properties <#setting-up>`__ +2. `Running the built-in suites and inspecting the results <#running-the-deepchecks-default-suites>`__ +3. `We'll spotlight two interesting checks - Embeddings drift and Under-Annotated Segments <#running-individual-checks>`__ + +To run deepchecks for NLP, you need the following for both your train and test data: + +1. Your :ref:`text data ` - a list of strings, each string is a single sample + (can be a sentence, paragraph, document, etc.). 2. Your labels - either a :ref:`Text Classification ` label or a - :ref:`Token Classification ` label. -3. Your models predictions (see :ref:`nlp__supported_tasks` for info on supported formats). + :ref:`Token Classification ` label. 
These are not needed for checks that + don't require labels (such as the Embeddings Drift check or most data integrity checks), but are needed for + many other checks. +3. Your model's predictions (see :ref:`nlp__supported_tasks` for info on supported formats). These are needed only for + the model related checks, shown in the `Model Evaluation <#model-evaluation>`__ section of this guide. If you don't have deepchecks installed yet: @@ -23,206 +34,300 @@ .. code:: python import sys - !{sys.executable} -m pip install langdetect>=1.0.9 textblob>=0.17.1 -U --quiet #--user - -Finally, we'll be using the CatBoost model in this guide, so we'll also need to install it: + !{sys.executable} -m pip install [nlp-properties] -U --quiet #--user -.. code:: python +Setting Up +========== - import sys - !{sys.executable} -m pip install catboost -U --quiet #--user +Load Data +--------- +For the purpose of this guide, we'll use a small subset of the +`tweet emotion `__ dataset: """ -#%% -# Load Data & Create TextData Objects -# =================================== -# For the purpose of this guide we'll use a small subset of the -# `tweet emotion `__ dataset: - -# Imports from deepchecks.nlp import TextData from deepchecks.nlp.datasets.classification import tweet_emotion -# Load Data train, test = tweet_emotion.load_data(data_format='DataFrame') train.head() -#%% +# %% # # We can see that we have the tweet text itself, the label (the emotion) and then some additional metadata columns. # +# Create a TextData Objects +# ------------------------- +# # We can now create a :class:`TextData ` object for the train and test dataframes. # This object is used to pass your data to the deepchecks checks. # # To create a TextData object, the only required argument is the text itself, but passing only the text -# will prevent multiple checks from running. In this example we'll pass the label as well and also provide -# metadata (the other columns in the dataframe) which we'll use later on in the guide. Finally, we'll also -# explicitly set the index. -# -# .. note:: -# -# The label column is optional, but if provided you must also pass the ``task_type`` argument, so that deepchecks -# will know how to interpret the label column. -# +# will prevent multiple checks from running. In this example we'll pass the label and define the task type and finally +# define the :ref:`metadata columns ` (the other columns in the dataframe) which we'll use later +# on in the guide. + train = TextData(train.text, label=train['label'], task_type='text_classification', metadata=train.drop(columns=['label', 'text'])) test = TextData(test.text, label=test['label'], task_type='text_classification', metadata=test.drop(columns=['label', 'text'])) -#%% -# Building a Model -# ================ +# %% +# Calculating Properties +# ---------------------- # -# In this example we'll train a very basic model for simplicity, using a CatBoostClassifier trained over the -# embeddings of the tweets. In this case these embeddings were created using the OpenAI GPT-3 model. -# If you want to reproduce this kind of basic model for your own task, you can calculate your own embeddings, or use -# our :func:`calculate_embeddings_for_text ` -# function to create embeddings from a generic model. Note that in order to run it you need either an OpenAI API key or have -# HuggingFace's transformers installed. +# Some of deepchecks' checks use properties of the text samples for various calculations. 
Deepcheck has a wide +# variety of such properties, some simple and some that rely on external models and are more heavy to run. In order +# for deepchecks' checks to be able to access the properties, they must be stored within the +# :class:`TextData ` object. You can read more about properties in the +# :ref:`Property Guide `. + +# properties can be either calculated directly by Deepchecks +# or imported from other sources in appropriate format + +# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +# train.calculate_default_properties( +# include_long_calculation_properties=True, device=device +# ) +# test.calculate_default_properties( +# include_long_calculation_properties=True, device=device +# ) -from sklearn.metrics import roc_auc_score -from catboost import CatBoostClassifier +train_properties, test_properties = tweet_emotion.load_properties() -# Load Embeddings and Split to Train and Test -train_embeddings, test_embeddings = tweet_emotion.load_embeddings(as_train_test=True) +train.set_properties(train_properties, categorical_properties=['Language']) +test.set_properties(test_properties, categorical_properties=['Language']) -model = CatBoostClassifier(max_depth=2, n_estimators=50, random_state=42) -model.fit(train_embeddings, train.label, verbose=0) -print(roc_auc_score(test.label, model.predict_proba(test_embeddings), - multi_class="ovr", average="macro")) +train.properties.head(2) -#%% -# Running Deepchecks -# ================== +# %% +# Running the Deepchecks Default Suites +# ===================================== # -# Now that we have our data and model, we can run our first checks! We'll run two types of checks: +# Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, alongside +# with their default conditions and thresholds. You can read more about customizing and creating your own suites in the +# :doc:`Customizations Guide `. # -# 1. `Model Evaluation Checks`_ - checks to run once we have trained our model. -# 2. `Data Integrity Checks`_ - checks to run on our dataset, before we train our model. +# Data Integrity +# -------------- +# We will start by doing preliminary integrity check to validate the text formatting. It is recommended to do this step +# before model training as it may imply additional data engineering is required. # -# Additionally ``deepchecks.nlp`` currently has one `Train-Test Validation` Check - the -# :class:`Label Drift ` Check. You can -# read more about when should you use deepchecks :ref:`here `. +# We'll do that using the :mod:`data_integrity ` pre-built suite. + +from deepchecks.nlp.suites import data_integrity + +data_integrity_suite = data_integrity() +data_integrity_suite.run(train, test) + +# %% +# Integrity #1: Unknown Tokens +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Model Evaluation Checks -# ----------------------- +# First up (in the “Didn't Pass” tab) we see that the Unknown Tokens check +# has returned a problem. # -# We'll start by running the -# :class:`PredictionDrift ` -# check, which will let us know if there has been a significant change in the model's predictions between the train -# and test data. Such a change may imply that something has changed in the data distribution between the train and -# test data in a way that affects the model's predictions. 
+# Looking at the result, we can see that it assumed (by default) that
+# we’re going to use the bert-base-uncased tokenizer for our NLP model,
+# and that if that’s the case there are many words in the dataset that
+# contain characters (such as emojis, or Korean characters) that are
+# unrecognized by the tokenizer. This is an important insight, as bert
+# tokenizers are very common. You can configure the tokenizer used by
+# this check by passing the tokenizer to the check’s constructor, and can
+# also configure the threshold for the percent of unknown tokens allowed by
+# modifying the check’s condition.
 #
-# We'll also add a condition to the check, which will make it fail if the drift score is higher than 0.1.
-
-# Start by computing the predictions for the train and test data:
-train_preds, train_probas = model.predict(train_embeddings), model.predict_proba(train_embeddings)
-test_preds, test_probas = model.predict(test_embeddings), model.predict_proba(test_embeddings)
-
-# Run the check
-from deepchecks.nlp.checks import PredictionDrift
+# Integrity #2: Text Outliers
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# In the “Other” tab, looking at the Text Outlier check result we can
+# derive several insights by hovering over the different values and inspecting the outlier texts:
+#
+# 1. Hashtags (‘#…’) are usually several words
+#    written together without spaces - we might consider splitting them
+#    before feeding the tweet to a model
+# 2. In some instances users
+#    deliberately misspell words, for example ‘!’ instead of the letter ‘l’
+#    or ‘okayyyyyyyyyy’.
+# 3. The majority of the data is in English but not
+#    all. If we want a classifier that is multilingual we should collect
+#    more data, otherwise we may consider dropping tweets in other languages
+#    from our dataset before training our model.
+#
+# Integrity #3: Property-Label Correlation (Shortcut Learning)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# In the "Passed" tab we can see the Property-Label Correlation check, which verifies the data does not contain
+# any shortcuts the model can fixate on during the learning process. In
+# our case we can see no indication that this problem exists in our
+# dataset. For more information about shortcut learning see:
+# https://towardsdatascience.com/shortcut-learning-how-and-why-models-cheat-1b37575a159

+# %%
+# Train Test Validation
+# ---------------------
+#
+# The next suite, the :mod:`train_test_validation ` suite, serves to validate our split and
+# compare the two datasets. These splits can be either your training and val / test sets, in which case you'd want to
+# run this suite after the split was made but before training, or for example your training and inference data, in
+# which case the suite is useful for validating that the inference data is similar enough to the training data.

-check = PredictionDrift().add_condition_drift_score_less_than(0.1)
-result = check.run(train, test, train_predictions=list(train_preds), test_predictions=list(test_preds))
+from deepchecks.nlp.suites import train_test_validation

-# Note: the result can be saved as html using suite_result.save_as_html()
-# or exported to json using suite_result.to_json()
-result.show()
+train_test_validation().run(train, test)

-#%%
-# We can see that the check passed, and that the drift score is quite low.
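The suite's ``run`` call returns a result object that can also be persisted rather than only displayed inline. A minimal sketch using the result's ``save_as_html`` and ``to_json`` methods (the file name here is only an illustrative choice):

validation_result = train_test_validation().run(train, test)

# Save an interactive HTML report that can be opened in any browser.
validation_result.save_as_html('train_test_validation_report.html')

# Export a machine-readable summary, e.g. for use in a CI pipeline.
validation_result_json = validation_result.to_json()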
+# %% +# Label Drift +# ~~~~~~~~~~~ # -# Next, we'll run the -# :class:`MetadataSegmentsPerformance ` -# check, which will check the performance of the model on different segments of the metadata that we provided -# earlier when creating the :class:`TextData ` objects, and report back on any segments that have significantly lower -# performance than the rest of the data. +# This check, appearing in the "Didn't Pass" tab, lets us see that we have some significant change in the +# distribution of the label - the label “optimism” is suddenly way more common in the test dataset, while other +# labels declined. This happened because we split on time, so the topics covered by the tweets in the test dataset +# may correspond to specific trends or events that happened later in time. Let’s investigate! + +# %% +# Model Evaluation +# ---------------- # +# The suite below, the :mod:`model_evaluation ` suite, is designed to be run after a model has +# been trained and requires model predictions which can be supplied via the relevant arguments in the ``run`` function. -from deepchecks.nlp.checks import MetadataSegmentsPerformance +train_preds, test_preds = tweet_emotion.load_precalculated_predictions( + pred_format='predictions', as_train_test=True) +train_probas, test_probas = tweet_emotion.load_precalculated_predictions( + pred_format='probabilities', as_train_test=True) -check = MetadataSegmentsPerformance() +from deepchecks.nlp.suites import model_evaluation -result = check.run(test, predictions=list(test_preds), probabilities=test_probas) +result = model_evaluation().run(train, test, + train_predictions=train_preds, + test_predictions=test_preds, + train_probabilities=train_probas, + test_probabilities=test_probas) result.show() -#%% -# As we can see, the check found a segment that has significantly lower performance than the rest of the data. In the -# first tab of the display we can see that there is a large segment of young Europeans that have significantly lower -# performance than the rest of the data. Perhaps there is some language gap here? We should probably collect and -# annotate more data from this segment. +# %% +# OK! We have many important issues being surfaced by this suite. Let’s dive into the individual checks: # -# Properties -# ^^^^^^^^^^ +# Model Eval #1: Train Test Performance +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Properties are one-dimension values that are extracted from the text. Among their uses, they can be used to -# segment the data, similar to the metadata segments that we saw in the previous check. +# We can immediately see in the "Didn't Pass" tab that there has been significant degradation in the Recall on +# class “optimism”. This is very likely a result of the severe label drift we saw after running the previous suite. # -# Before we can run the -# :class:`PropertySegmentsPerformance ` -# check, we need to make sure that our :class:`TextData ` objects have the properties that we -# want to use. Properties can be added to the TextData objects in one of the following ways: +# Model Eval #2: Segment Performance +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# 1. Calculated automatically by deepchecks. Deepchecks has a set of predefined properties that can be calculated -# automatically. They can be added to the TextData object either by passing ``properties='auto'`` to the TextData -# constructor, or by calling the -# :meth:`calculate_default_properties ` method anytime later. -# 2. You can calculate your own properties and then add them to the TextData object. 
This can be done by passing a
-# DataFrame of properties to the TextData `properties` argument, or by calling the
-# :meth:`set_properties ` method anytime later with such a DataFrame. You
+# Also in the "Didn't Pass" tab we can see the two segment performance checks - Property Segment Performance and
+# Metadata Segment Performance. These use the :ref:`metadata columns ` of user-related
+# information OR our :ref:`calculated properties ` to try and **automatically**
+# detect significant data segments on which our model performs badly.
 #
-# .. note::
 #
+# In this case we can see that both checks have found issues in the test
+# dataset:
+#
+# 1. The Property Segment Performance check has found that we’re
+#    getting very poor results on low toxicity samples. That probably means
+#    that our model is using the toxicity of the text to infer the “anger”
+#    label, and is having a harder time with other, more benign text
+#    samples.
+# 2. The Metadata Segment Performance check has found that we
+#    have a problem predicting correct results on new users from the Americas. That’s
+#    5% of our dataset, so we’d better investigate that further.
 #
-#     Some default properties require additional packages to be installed. If you want to use them, you can
-#     install them by running ``pip install deepchecks[nlp-properties]``.
-#     Additionally, some properties that use the ``transformers`` package are computationally expensive, and may take
-#     a long time to calculate. If you have a GPU or a similar device you can use it by installing the appropriate
-#     package versions and passing a ``device`` argument to the ``TextData`` constructor or to the
-#     ``calculate_default_properties`` method.
+# You'll note that these two issues occur only in the test data, and so the results of these checks for the
+# training data appear in the "Passed" tab.
 #
+# Model Eval #3: Prediction Drift
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 #
+# We note that the Prediction Drift check (here in the “Passed” tab) shows no
+# issue. Given that we already know that there is significant Label Drift,
+# this means we have Concept Drift - the labels corresponding to our
+# samples have changed, while the model continues to predict the same
+# labels. You can learn more about the different types of drift and how deepchecks detects them in our
+# :ref:`Drift Guide `.

+# %%
+# Running Individual Checks
+# =========================
+#
+# Checks can also be run individually. In this section, we'll show two of the more interesting checks and how you can
+# run them stand-alone and add conditions to them. You can learn more about customizing suites, checks and conditions
+# in our :doc:`Customizations Guide `.
+#
+# Embeddings Drift
+# ----------------
+#
+# In order to run the :ref:`Embeddings Drift ` check,
+# you must have text embeddings loaded for both datasets. You can read more about using embeddings in deepchecks NLP in
+# our :ref:`Embeddings Guide `.
+#
+# In this example, we have the embeddings already
+# pre-calculated:

-# Calculate properties
-train.calculate_default_properties()
-test.calculate_default_properties()
+from deepchecks.nlp.datasets.classification.tweet_emotion import load_embeddings

-# Run the check
-from deepchecks.nlp.checks import PropertySegmentsPerformance
+train_embeddings, test_embeddings = load_embeddings()

-check = PropertySegmentsPerformance(segment_minimum_size_ratio=0.07)
-result = check.run(test, predictions=list(test_preds), probabilities=test_probas)
-result.show()
+train.set_embeddings(train_embeddings)
+test.set_embeddings(test_embeddings)

-#%%
-# As we can see, the check found some segments that have lower performance compared to the rest of the dataset. It seems
-# that the model has a harder time predicting the emotions in the "neutral-positive" sentiment range (in our case,
-# between around 0 and 0.45).
-#
-# Data Integrity Checks
-# ---------------------
-#
-# These previous checks were all about the model's performance. Now we'll run a check that attempts to find instances
-# of shortcut learning - cases in which the label can be predicted by simple aspects of the data, which
-# in many cases can be an indication that the model has used some information that won't generalize to the real world.
+# %%
+# You can also calculate the embeddings using deepchecks, either using an
+# open-source sentence-transformer or using Open AI’s embedding API.
+
+# train.calculate_default_embeddings()
+# test.calculate_default_embeddings()
+
+# %%
 #
-# This check is the
-# :class:`PropertyLabelCorrelation `
-# check, which will check the correlation between the properties and the labels, and report back on any properties that
-# have a high correlation with the labels.
-from deepchecks.nlp.checks import PropertyLabelCorrelation
+from deepchecks.nlp.checks import TextEmbeddingsDrift

-check = PropertyLabelCorrelation(n_top_features=10)
-result = check.run(test)
-result.show()
+check = TextEmbeddingsDrift()
+res = check.run(train, test)
+res.show()

-#%%
-# In this case the check didn't find any properties that have a high correlation with the labels. Apart from the
-# sentiment property, which is expected to have high relevance to the emotion of the tweet, the other properties
-# have very low correlation to the label.
+# %%
+# Here we can see some clusters that distinctly contain more
+# samples from train or more samples from test. For example, if we look at
+# the greenish cluster in the middle (by hovering on the samples and reading the tweets) we see it’s full of
+# inspirational quotes and sayings, and belongs mostly to the test dataset. That is the
+# source of the drastic increase in optimistic labels!
+#
+# There are of course also other note-worthy clusters, such as the greenish cluster on the right that contains tweets
+# about a terror attack in Bangladesh, which belongs solely to the test data.

+# %%
+# Under Annotated Segments
+# ------------------------
+#
+# Another note-worthy check is the
+# :ref:`Under Annotated Segments ` check,
+# which explores our data and automatically identifies segments where the data
+# is under-annotated - meaning that the ratio of missing labels is higher.
+# To this check we’ll also add a condition that will fail in case
+# an under-annotated segment of significant size is found.
+ +from deepchecks.nlp.checks import UnderAnnotatedPropertySegments +test_under = tweet_emotion.load_under_annotated_data() + +check = UnderAnnotatedPropertySegments( + segment_minimum_size_ratio=0.1 +).add_condition_segments_relative_performance_greater_than() + +check.run(test_under) + +# %% +# For example, here the check detected that we have a lot of lacking annotations for samples that are informal and +# not very fluent. May it be the case that our annotators have a problem annotating these samples and prefer not to +# deal with them? If these samples are important for use, we may have to put special focus on annotating this segment. +# +# .. note:: # -# You can find the full list of available NLP checks in the :mod:`nlp.checks api documentation `. +# You can find the full list of available NLP checks in the :mod:`nlp.checks api documentation ֿ +# `. # sphinx_gallery_thumbnail_path = '_static/images/sphinx_thumbnails/nlp_quickstarts/getting_started.png' \ No newline at end of file From cac68b98f9246846fb12ca2cc3341cceebc26c6c Mon Sep 17 00:00:00 2001 From: matanper Date: Thu, 18 May 2023 17:50:23 +0300 Subject: [PATCH 17/20] fix set_precision in pandas deprecated (#2543) * fix set_precision in pandas deprecated * update comment --- deepchecks/core/serialization/dataframe/html.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deepchecks/core/serialization/dataframe/html.py b/deepchecks/core/serialization/dataframe/html.py index f945fdd911..d2af8b61a6 100644 --- a/deepchecks/core/serialization/dataframe/html.py +++ b/deepchecks/core/serialization/dataframe/html.py @@ -14,6 +14,7 @@ import pandas as pd from pandas.io.formats.style import Styler +from pkg_resources import parse_version from deepchecks.core.serialization.abc import HtmlSerializer @@ -49,7 +50,11 @@ def serialize(self, **kwargs) -> str: # Using deprecated pandas method so hiding the warning with warnings.catch_warnings(): warnings.simplefilter(action='ignore', category=FutureWarning) - df_styler.set_precision(2) + # Set precision is deprecated since pandas 1.3.0 + if parse_version(pd.__version__) < parse_version('1.3.0'): + df_styler.set_precision(2) + else: + df_styler.format(precision=2) table_css_props = [ ('text-align', 'left'), # Align everything to the left ('white-space', 'pre-wrap') # Define how to handle white space characters (like \n) From a886103c14af0f345c8d679ef5a673e56391d2f7 Mon Sep 17 00:00:00 2001 From: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> Date: Thu, 18 May 2023 18:38:53 +0300 Subject: [PATCH 18/20] multi label quick start (#2541) * example for multilabel --------- Co-authored-by: Noam Bressler Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com> --- .../just_dance_comment_analysis.py | 6 + deepchecks/nlp/suites/default_suites.py | 5 +- .../general/usage/customizations/README.txt | 2 + .../plot_multi_label_classification.py | 249 ++++++++++++++++++ .../quickstarts/plot_text_classification.py | 17 +- .../nlp/usage_guides/supported_tasks.rst | 8 +- .../weak_segment_performance_test.py | 19 ++ tests/nlp/conftest.py | 16 +- 8 files changed, 306 insertions(+), 16 deletions(-) create mode 100644 docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py diff --git a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py index 490931e43f..822c14a4f9 100644 --- a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py +++ 
b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py @@ -27,6 +27,7 @@ """ import pathlib import typing as t +import warnings import numpy as np import pandas as pd @@ -185,6 +186,11 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_ful """ if data_format.lower() not in ['textdata', 'dataframe']: raise ValueError('data_format must be either "Dataset" or "Dataframe"') + elif data_format.lower() == 'dataframe': + if include_properties or include_embeddings: + warnings.warn('include_properties and include_embeddings are incompatible with data_format="Dataframe"', + UserWarning) + include_properties, include_embeddings = False, False if use_full_size: data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False) diff --git a/deepchecks/nlp/suites/default_suites.py b/deepchecks/nlp/suites/default_suites.py index c4bdb49aa0..0f6d14f39a 100644 --- a/deepchecks/nlp/suites/default_suites.py +++ b/deepchecks/nlp/suites/default_suites.py @@ -106,7 +106,9 @@ def train_test_validation(n_samples: int = None, 'Train Test Validation Suite', PropertyDrift(**kwargs).add_condition_drift_score_less_than(), LabelDrift(**kwargs).add_condition_drift_score_less_than(), - TrainTestSamplesMix(**kwargs).add_condition_duplicates_ratio_less_or_equal() + TrainTestSamplesMix(**kwargs).add_condition_duplicates_ratio_less_or_equal(), + TextEmbeddingsDrift(**kwargs).add_condition_overall_drift_value_less_than() + ) @@ -148,7 +150,6 @@ def model_evaluation(n_samples: int = None, PredictionDrift(**kwargs).add_condition_drift_score_less_than(), PropertySegmentsPerformance(**kwargs).add_condition_segments_relative_performance_greater_than(), MetadataSegmentsPerformance(**kwargs).add_condition_segments_relative_performance_greater_than(), - TextEmbeddingsDrift().add_condition_overall_drift_value_less_than() ) diff --git a/docs/source/general/usage/customizations/README.txt b/docs/source/general/usage/customizations/README.txt index 6067e438e4..330c837917 100644 --- a/docs/source/general/usage/customizations/README.txt +++ b/docs/source/general/usage/customizations/README.txt @@ -1,3 +1,5 @@ +.. _general__customizations: + ====================== Customizations ====================== diff --git a/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py new file mode 100644 index 0000000000..cd4e6cdafa --- /dev/null +++ b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- +""" +.. _nlp__multilabel_quickstart: + +NLP Multi Label Classification Quickstart +***************************************** + +In this quickstart guide, we will go over using the deepchecks NLP package to analyze and evaluate a text +multi label classification task. If you are interested in a regular multiclass classification task, you can +refer to our :ref:`Multiclass Quickstart `. We will cover the following: + +1. `Creating a TextData object and auto calculating properties <#setting-up>`__ +2. `Running the built-in suites <#running-the-deepchecks-default-suites>`__ +3. `Running individual checks <#running-individual-checks>`__ + +To run deepchecks for NLP, you need the following for both your train and test data: + +1. Your text data - a list of strings, each string is a single sample (can be a sentence, paragraph, document, etc.). +2. Your labels and prediction in the :ref:`correct format ` (Optional). +3. 
:ref:`Metadata `, :ref:`Properties ` + or :ref:`Embeddings ` for the provided text data (Optional). + +If you don't have deepchecks installed yet: + +.. code:: python + + import sys + !{sys.executable} -m pip install deepchecks[nlp] -U --quiet #--user + +Some properties calculated by ``deepchecks.nlp`` require additional packages to be installed. You can +install them by running: + +.. code:: python + + import sys + !{sys.executable} -m pip install [nlp-properties] -U --quiet #--user + +Setting Up +========== + +Load Data +--------- +For the purpose of this guide, we'll use a small subset of the +`just dance `__ comment analysis dataset. +A dataset containing comments, metadata and labels for a multilabel category classification use case on youtube comments. + +""" + +from deepchecks.nlp import TextData +from deepchecks.nlp.datasets.classification import just_dance_comment_analysis + +data = just_dance_comment_analysis.load_data(data_format='DataFrame', + as_train_test=False) +metadata_cols = ['likes', 'dateComment'] +data.head(2) + +# %% +# Create TextData Objects +# ------------------------ +# +# Deepchecks' :ref:`TextData ` object contains the text samples, labels, and possibly +# also properties and metadata. It stores +# cache to save time between repeated computations and contains functionalities for input validations and sampling. + +label_cols = data.drop(columns=['originalText'] + metadata_cols) +class_names = label_cols.columns.to_list() +dataset = TextData(data['originalText'], label=label_cols.to_numpy().astype(int), + task_type='text_classification', + metadata=data[metadata_cols], categorical_metadata=[]) + +# %% +# Calculating Properties +# ---------------------- +# +# Some of deepchecks' checks use properties of the text samples for various calculations. Deepcheck has a wide +# variety of such properties, some simple and some that rely on external models and are more heavy to run. In order +# for deepchecks' checks to be able to access the properties, they must be stored within the +# :ref:`TextData ` object. You can read more about properties in the +# :ref:`Property Guide `. + +# properties can be either calculated directly by Deepchecks +# or imported from other sources in appropriate format + +# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +# dataset.calculate_builtin_properties(include_long_calculation_properties=True, device=device) + +properties = just_dance_comment_analysis.load_properties(as_train_test=False) +dataset.set_properties(properties, categorical_properties=['Language']) +dataset.properties.head(2) + +# %% +# Running the deepchecks default suites +# ===================================== +# +# Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, alongside +# with their default conditions and thresholds. You can read more about customizing and creating your own suites in the +# :ref:`Customizations Guide `. +# +# Data Integrity +# -------------- +# We will start by doing preliminary integrity check to validate the text formatting. It is recommended to do this step +# before your train and test/validation splits and model training as it may imply additional data +# engineering is required. +# +# We'll do that using the :mod:`data_integrity ` pre-built suite. Note that we are limiting +# the number of samples to 1000 in order to get quick high level overview of potential issues. 
+ +from deepchecks.nlp.suites import data_integrity + +data_integrity_suite = data_integrity(n_samples=1000) +data_integrity_suite.run(dataset, model_classes=class_names) + +# %% +# Integrity #1: Unknown Tokens +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# First up (in the “Didn’t Pass” tab) we see that the Unknown Tokens check +# has returned a problem. +# +# Looking at the result, we can see that it assumed (by default) that +# we’re going to use the bert-base-uncased tokenizer for our NLP model, +# and that if that’s the case there are many words in the dataset that +# contain characters (specifically here emojis) that are +# unrecognized by the tokenizer. This is an important insight, as bert +# tokenizers are very common. +# +# Integrity #2: Conflicting Labels +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Looking at the Conflicting Labels check result (in the “Didn’t Pass” tab) we can +# see that there are 2 occurrences of duplicate samples that have different labels. +# This may suggest a more severe labeling error in the dataset which we would want to explore further. +# + +# %% +# Train Test Validation +# --------------------- +# +# The next suite, the :mod:`train_test_validation ` suite serves to validate our split and +# compare the two dataset. These splits can be either you training and val / test sets, in which case you'd want to run +# this suite after the split was made but before training, or for example your training and inference data, in which +# case the suite is useful for validating that the inference data is similar enough to the training data. +# +# To run this suite we'll split the data into train and test/validation sets. We'll use a predefined split based +# on comment dates. + +from deepchecks.nlp.suites import train_test_validation + +train_ds, test_ds = just_dance_comment_analysis.load_data( + data_format='TextData', as_train_test=True, + include_embeddings=True, include_properties=True) +train_test_validation(n_samples=1000).run(train_ds, test_ds, + model_classes=class_names) + +# %% +# Train Test Validation #1: Properties Drift +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Based on the different properties we have calculated for the dataset, we can now search for +# properties whose distribution changes between the train and test datasets. Changes like this +# are especially important to look for when monitoring your model over time, as data drift +# is one of the top reasons why machine learning model’s performance degrades over time. +# +# In our case, we can see that the “% Special Characters” and the "Formality" property have different distributions +# between train and test. Drilling further into the results, we can see that the language of the comments in the +# test set is much less formal and includes more special characters (possibly emojis?) than the train set. +# Since this change is quite significant, we may want to consider adding more informal comments containing +# special characters to the train set before training (or retraining) our model. +# +# Train Test Validation #2: Embedding Drift +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Similarly to the properties drift, we can also look for embedding drift between the train and test datasets. +# The benefit of using embedding on top of the properties is that they are able to detect semantic changes in the data. +# +# In our case, we see there are significant semantic differences between the train and test sets. 
Specifically,
+# we can see some clusters that distinctly contain more samples from the train dataset or more samples from the
+# test dataset. By hovering over the clusters we can read the user comments and understand the difference between
+# the clusters.
+
+# %%
+# Model Evaluation
+# ----------------
+#
+# The suite below, the :mod:`model_evaluation ` suite, is designed to be run after a model has
+# been trained and requires model predictions which can be supplied via the relevant arguments in the ``run`` function.
+
+train_preds, test_preds = just_dance_comment_analysis.\
+    load_precalculated_predictions(pred_format='predictions',
+                                   as_train_test=True)
+train_probas, test_probas = just_dance_comment_analysis.\
+    load_precalculated_predictions(pred_format='probabilities',
+                                   as_train_test=True)
+
+from deepchecks.nlp.suites import model_evaluation
+
+suite = model_evaluation(n_samples=1000)
+result = suite.run(train_ds, test_ds,
+                   train_predictions=train_preds,
+                   test_predictions=test_preds,
+                   train_probabilities=train_probas,
+                   test_probabilities=test_probas,
+                   model_classes=class_names)
+result.show()
+
+# %%
+# Model Eval #1: Train Test Performance
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# We can immediately see in the "Didn't Pass" tab that there has been significant degradation in the Recall on
+# class “Pain and Discomfort”. Moreover, it seems there is a general deterioration in our model
+# performance on the test set compared to the train set. This can be explained
+# based on the data drift we saw in the previous suite.
+#
+
+# %%
+# Running Individual Checks
+# =========================
+#
+# Checks can also be run individually as well as within a suite. You can learn more about customizing suites,
+# checks and conditions in our :ref:`Customizations Guide `. In this section, we'll show you
+# how to do that while showcasing one of our most interesting checks - :ref:`PropertySegmentPerformance
+# `.
+#
+
+from deepchecks.nlp.checks import PropertySegmentsPerformance
+
+check = PropertySegmentsPerformance(segment_minimum_size_ratio=0.05)
+check = check.add_condition_segments_relative_performance_greater_than(0.1)
+result = check.run(test_ds, probabilities=test_probas)
+result.show()
+
+# %%
+# In the display we can see some distinct property-based segments that our model underperforms on.
+#
+# By reviewing the results we can see that our model is performing poorly on samples that have a low level of
+# Subjectivity. By looking at the "Subjectivity vs Average Sentence Length" tab
+# we can see that the problem is even more severe on samples containing long sentences.
+#
+# In addition to the visual display, most checks also return detailed data describing the results. This data can be
+# used for further analysis, to create custom visualizations or to set custom conditions.
+#
+
+result.value['weak_segments_list'].head(3)
+
+# %%
+# You can find the full list of available NLP checks in the
+# :mod:`nlp.checks api documentation
+# `.
+
diff --git a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py
index e8d57b7f5d..72509c1be3 100644
--- a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py
+++ b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py
@@ -1,10 +1,13 @@
 # -*- coding: utf-8 -*-
 """
+.. 
_nlp__multiclass_quickstart: + Test NLP Classification Tasks - Quickstart ****************************************** In this quickstart guide, we will go over using the deepchecks NLP package to analyze and evaluate text -classification tasks. We will cover the following steps: +classification tasks. If you are interested in a multilabel classification task, you can +refer to our :ref:`Multilabel Quickstart `. We will cover the following steps: 1. `Creating a TextData object and auto calculating properties <#setting-up>`__ 2. `Running the built-in suites and inspecting the results <#running-the-deepchecks-default-suites>`__ @@ -59,7 +62,7 @@ # Create a TextData Objects # ------------------------- # -# We can now create a :class:`TextData ` object for the train and test dataframes. +# We can now create a :ref:`TextData ` object for the train and test dataframes. # This object is used to pass your data to the deepchecks checks. # # To create a TextData object, the only required argument is the text itself, but passing only the text @@ -80,17 +83,17 @@ # Some of deepchecks' checks use properties of the text samples for various calculations. Deepcheck has a wide # variety of such properties, some simple and some that rely on external models and are more heavy to run. In order # for deepchecks' checks to be able to access the properties, they must be stored within the -# :class:`TextData ` object. You can read more about properties in the +# :ref:`TextData ` object. You can read more about properties in the # :ref:`Property Guide `. # properties can be either calculated directly by Deepchecks # or imported from other sources in appropriate format # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -# train.calculate_default_properties( +# train.calculate_builtin_properties( # include_long_calculation_properties=True, device=device # ) -# test.calculate_default_properties( +# test.calculate_builtin_properties( # include_long_calculation_properties=True, device=device # ) @@ -107,7 +110,7 @@ # # Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, alongside # with their default conditions and thresholds. You can read more about customizing and creating your own suites in the -# :doc:`Customizations Guide `. +# :ref:`Customizations Guide `. # # Data Integrity # -------------- @@ -255,7 +258,7 @@ # # Checks can also be run individually. In this section, we'll show two of the more interesting checks and how you can # run them stand-alone and add conditions to them. You can learn more about customizing suites, checks and conditions -# in our :doc:`Customizations Guide `. +# in our :ref:`Customizations Guide `. # # Embeddings Drift # ---------------- diff --git a/docs/source/nlp/usage_guides/supported_tasks.rst b/docs/source/nlp/usage_guides/supported_tasks.rst index 868440eb6f..649a6bb46c 100644 --- a/docs/source/nlp/usage_guides/supported_tasks.rst +++ b/docs/source/nlp/usage_guides/supported_tasks.rst @@ -9,7 +9,7 @@ In the deepchecks nlp package, predictions are passed into the suite / check ``r predictions only (passing a fitted model is currently not supported). -.. _nlp_supported_tasks__types: +.. _nlp__supported_tasks_types: Supported Task Types ==================== @@ -33,7 +33,7 @@ Deepchecks currently supports two NLP task types: - Named Entity Recognition, - Part-of-speech annotation (in which all tokens have a non-background class). -.. _nlp_supported_labels__predictions_format: +.. 
_nlp__supported_labels_predictions_format: Supported Labels and Predictions Format ======================================= @@ -42,7 +42,7 @@ While labels are passed when constructing the :class:`TextData >> predictions = [[0, 0, 1], [0, 1, 1]] >>> probabilities = [[0.2, 0.3, 0.8], [0.4, 0.9, 0.6]] -.. _nlp_supported_token_classification: +.. _nlp__supported_token_classification: Token Classification -------------------- diff --git a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py index 5bcd525cfa..baf4fc78db 100644 --- a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py +++ b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py @@ -102,3 +102,22 @@ def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): assert_that(result.value['avg_score'], close_to(0.83, 0.001)) assert_that(len(result.value['weak_segments_list']), is_in([5, 6])) # TODO: check why it's not always 5 assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.695, 0.01)) + + +def test_multilabel_just_dance(just_dance_train_test_textdata, just_dance_train_test_textdata_probas): + # Arrange + _, data = just_dance_train_test_textdata + _, probabilities = just_dance_train_test_textdata_probas + assert_that(data.is_multi_label_classification(), equal_to(True)) + + data = data.copy(rows_to_use = range(1000)) + probabilities = probabilities[:1000, :] + check = PropertySegmentsPerformance() + + # Act + result = check.run(data, probabilities=probabilities) + + # Assert + assert_that(result.value['avg_score'], close_to(0.615, 0.001)) + assert_that(len(result.value['weak_segments_list']), is_in([79, 80])) # TODO: check why it's not always 80 + assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.401, 0.01)) diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index 64985c27f2..186640a729 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -38,12 +38,22 @@ def tweet_emotion_train_test_textdata(): include_embeddings=True) return train, test +@pytest.fixture(scope='session') +def just_dance_train_test_textdata(): + """Just Dance text multilabel classification dataset""" + return just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True, + include_embeddings=True) + +@pytest.fixture(scope='session') +def just_dance_train_test_textdata_probas(): + """Just Dance text multilabel classification dataset""" + return just_dance_comment_analysis.load_precalculated_predictions(pred_format='probabilities', as_train_test=True) + @pytest.fixture(scope='session') -def just_dance_train_test_textdata_sampled(): +def just_dance_train_test_textdata_sampled(just_dance_train_test_textdata): """Just Dance text multilabel classification dataset""" - train, test = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True, - include_embeddings=True) + train, test = just_dance_train_test_textdata sampled_train = train.sample(500, random_state=42) sampled_test = test.sample(500, random_state=42) return sampled_train, sampled_test From a1f921ef26433e357c790e1bcc577d5bf937b824 Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Sun, 21 May 2023 11:58:46 +0530 Subject: [PATCH 19/20] Fixes #2524 Refactoring of the calculate_default_properties function and adding new text properties (#2536) * Refactoring of the calculate_default_properties function and adding new properties --------- Co-authored-by: Noam Bressler --- .gitignore | 12 +- 
deepchecks/nlp/text_data.py | 12 +- deepchecks/nlp/utils/__init__.py | 4 +- deepchecks/nlp/utils/text_properties.py | 160 +++++++++++++-- .../plot_property_drift.py | 4 +- .../nlp/usage_guides/nlp_properties.rst | 73 ++++--- .../nlp/usage_guides/text_data_object.rst | 6 +- .../under_annotated_segments_test.py | 2 +- .../weak_segment_performance_test.py | 44 ++-- .../property_drift_test.py | 27 ++- tests/nlp/test_text_data.py | 3 +- tests/nlp/utils/test_properties.py | 194 ++++++++++++++++-- 12 files changed, 436 insertions(+), 105 deletions(-) diff --git a/.gitignore b/.gitignore index 484a145dcd..cfa8244452 100644 --- a/.gitignore +++ b/.gitignore @@ -138,4 +138,14 @@ benchmarks/results deepchecks/nlp/utils/.nlp-models # embedding files -tests/nlp/utils/embeddings.csv \ No newline at end of file +tests/nlp/utils/embeddings.csv +embeddings.csv.npy +embeddings.npy +deepchecks/nlp/datasets/assets/tweet_emotion/tweet_emotion_embeddings.npy + +# nlp datasets +deepchecks/nlp/datasets/assets/just_dance_comment_analysis + +# nlp test properties +metadata.csv +test_properties.csv \ No newline at end of file diff --git a/deepchecks/nlp/text_data.py b/deepchecks/nlp/text_data.py index dac7ebc97e..c849c781e9 100644 --- a/deepchecks/nlp/text_data.py +++ b/deepchecks/nlp/text_data.py @@ -23,7 +23,7 @@ validate_raw_text, validate_tokenized_text) from deepchecks.nlp.task_type import TaskType, TTextLabel from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings -from deepchecks.nlp.utils.text_properties import calculate_default_properties +from deepchecks.nlp.utils.text_properties import calculate_builtin_properties from deepchecks.utils.logger import get_logger from deepchecks.utils.metrics import is_label_none from deepchecks.utils.validation import is_sequence_not_str @@ -85,7 +85,7 @@ class TextData: properties. If None, no properties are set. The number of rows in the properties DataFrame must be equal to the number of samples in the dataset, and the order of the rows must be the same as the order of the samples in the dataset. - In order to calculate the default properties, use the `TextData.calculate_default_properties` function after + In order to calculate the default properties, use the `TextData.calculate_builtin_properties` function after the creation of the TextData object. For more on properties, see the `NLP Properties Guide `_. 
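For orientation, a minimal sketch of the renamed flow described in this docstring — the sample texts below are invented, and ``'Text Length'`` is simply one of the built-in property names listed in this patch:

from deepchecks.nlp import TextData

# Any list of strings works as the raw text input.
texts = ['Deepchecks helps validate NLP data.', 'A second, shorter sample.']
text_data = TextData(texts)

# Calculate the default built-in properties (skipping one of them here) and
# store them on the TextData object so that checks can use them later.
text_data.calculate_builtin_properties(ignore_properties=['Text Length'])
text_data.properties.head()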
@@ -372,7 +372,7 @@ def set_metadata( self._metadata = metadata.reset_index(drop=True) self._cat_metadata = column_types.categorical_columns - def calculate_default_properties( + def calculate_builtin_properties( self, include_properties: t.Optional[t.List[str]] = None, ignore_properties: t.Optional[t.List[str]] = None, @@ -398,7 +398,7 @@ def calculate_default_properties( if self._properties is not None: warnings.warn('Properties already exist, overwriting them', UserWarning) - properties, properties_types = calculate_default_properties( + properties, properties_types = calculate_builtin_properties( list(self.text), include_properties=include_properties, ignore_properties=ignore_properties, @@ -442,7 +442,7 @@ def save_properties(self, path: str): if self._properties is None: raise DeepchecksNotSupportedError( 'TextData does not contain properties, add them by using ' - '"calculate_default_properties" or "set_properties" functions' + '"calculate_builtin_properties" or "set_properties" functions' ) self._properties.to_csv(path, index=False) @@ -454,7 +454,7 @@ def properties(self) -> pd.DataFrame: raise DeepchecksNotSupportedError( 'Functionality requires properties, but the the TextData object had none. To use this functionality, ' 'use the set_properties method to set your own properties with a pandas.DataFrame or use ' - 'TextData.calculate_default_properties to add the default deepchecks properties.' + 'TextData.calculate_builtin_properties to add the default deepchecks properties.' ) return self._properties diff --git a/deepchecks/nlp/utils/__init__.py b/deepchecks/nlp/utils/__init__.py index 18f6cbf94b..eabb208700 100644 --- a/deepchecks/nlp/utils/__init__.py +++ b/deepchecks/nlp/utils/__init__.py @@ -12,10 +12,10 @@ from deepchecks.nlp.utils.llm_utils import call_open_ai_completion_api from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings -from deepchecks.nlp.utils.text_properties import calculate_default_properties +from deepchecks.nlp.utils.text_properties import calculate_builtin_properties __all__ = [ - 'calculate_default_properties', + 'calculate_builtin_properties', 'calculate_default_embeddings', 'call_open_ai_completion_api' ] diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 89ebaac8f2..e73e21b35d 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -11,6 +11,7 @@ """Module containing the text properties for the NLP module.""" import importlib import pathlib +import re import string import warnings from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union @@ -28,7 +29,7 @@ from deepchecks.utils.function import run_available_kwargs from deepchecks.utils.ipython import create_progress_bar -__all__ = ['calculate_default_properties'] +__all__ = ['calculate_builtin_properties'] MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models' @@ -411,10 +412,10 @@ def readability_score(raw_text: Sequence[str]) -> List[float]: for text in raw_text: if not pd.isna(text): sentence_count = len(sent_tokenize(text)) - text = remove_punctuation(text) + text = remove_punctuation(text.lower()) words = word_tokenize(text) word_count = len(words) - syllable_count = sum([len(cmudict_dict[word.lower()]) for word in words if word.lower() in cmudict_dict]) + syllable_count = sum([len(cmudict_dict[word]) for word in words if word in cmudict_dict]) if word_count != 0 and sentence_count != 0 and syllable_count != 0: avg_syllables_per_word = 
syllable_count / word_count avg_words_per_sentence = word_count / sentence_count @@ -448,6 +449,113 @@ def average_sentence_length(raw_text: Sequence[str]) -> List[float]: return result +def count_unique_urls(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting the number of unique URLS per text sample.""" + url_pattern = r'https?:\/\/(?:[-\w.]|(?:%[\da-fA-F]{2}))+' + return [len(set(re.findall(url_pattern, text))) if not pd.isna(text) else 0 for text in raw_text] + + +def count_urls(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting the number of URLS per text sample.""" + url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+' + return [len(re.findall(url_pattern, text)) if not pd.isna(text) else 0 for text in raw_text] + + +def count_unique_email_addresses(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting the number of unique email addresses per text sample.""" + email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b' + return [len(set(re.findall(email_pattern, text))) if not pd.isna(text) else 0 for text in raw_text] + + +def count_email_addresses(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting the number of email addresses per text sample.""" + email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b' + return [len(re.findall(email_pattern, text)) if not pd.isna(text) else 0 for text in raw_text] + + +def count_unique_syllables(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting the number of unique syllables per text sample.""" + if not nltk_download('punkt', quiet=True): + warnings.warn('nltk punkt not found, readability score cannot be calculated.' + ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + if not nltk_download('cmudict', quiet=True): + warnings.warn('nltk cmudict not found, readability score cannot be calculated.' + ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + result = [] + cmudict_dict = corpus.cmudict.dict() + for text in raw_text: + if not pd.isna(text): + text = remove_punctuation(text.lower()) + words = word_tokenize(text) + syllables = {word: True for word in words if word in cmudict_dict} + result.append(len(syllables)) + else: + result.append(np.nan) + return result + + +def reading_time(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting time in seconds to read each text sample. + + The formula is based on Demberg & Keller, 2008 where it is assumed that + reading a character taken 14.69 milliseconds on average. + """ + ms_per_char = 14.69 + result = [] + for text in raw_text: + if not pd.isna(text): + words = text.split() + nchars = map(len, words) + rt_per_word = map(lambda nchar: nchar * ms_per_char, nchars) + ms_reading_time = sum(list(rt_per_word)) + result.append(round(ms_reading_time/1000, 2)) + else: + result.append(0.00) + return result + + +def sentence_length(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting the number of sentences per text sample.""" + if not nltk_download('punkt', quiet=True): + warnings.warn('nltk punkt not found, average syllable length cannot be calculated.' 
+ ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + result = [] + for text in raw_text: + if not pd.isna(text): + sentence_count = len(sent_tokenize(text)) + result.append(sentence_count) + else: + result.append(np.nan) + return result + + +def average_syllable_length(raw_text: Sequence[str]) -> List[str]: + """Return a list of integers denoting the average number of syllables per sentences per text sample.""" + if not nltk_download('punkt', quiet=True): + warnings.warn('nltk punkt not found, average syllable length cannot be calculated.' + ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + if not nltk_download('cmudict', quiet=True): + warnings.warn('nltk cmudict not found, average syllable length cannot be calculated.' + ' Please check your internet connection.', UserWarning) + return [np.nan] * len(raw_text) + cmudict_dict = corpus.cmudict.dict() + result = [] + for text in raw_text: + if not pd.isna(text): + sentence_count = len(sent_tokenize(text)) + text = remove_punctuation(text.lower()) + words = word_tokenize(text) + syllable_count = sum([len(cmudict_dict[word]) for word in words if word in cmudict_dict]) + result.append(round(syllable_count/sentence_count, 2)) + else: + result.append(np.nan) + return result + + class TextProperty(TypedDict): name: str method: Callable[..., Sequence[Any]] @@ -472,13 +580,24 @@ class TextProperty(TypedDict): ) +ALL_PROPERTIES: Tuple[TextProperty, ...] = ( + {'name': 'Count URLs', 'method': count_urls, 'output_type': 'numeric'}, + {'name': 'Count Email Address', 'method': count_email_addresses, 'output_type': 'numeric'}, + {'name': 'Count Unique URLs', 'method': count_unique_urls, 'output_type': 'numeric'}, + {'name': 'Count Unique Email Address', 'method': count_unique_email_addresses, 'output_type': 'numeric'}, + {'name': 'Count Unique Syllables', 'method': count_unique_syllables, 'output_type': 'numeric'}, + {'name': 'Reading Time', 'method': reading_time, 'output_type': 'numeric'}, + {'name': 'Sentence Length', 'method': sentence_length, 'output_type': 'numeric'}, + {'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'}, +) + DEFAULT_PROPERTIES + + LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count') LARGE_SAMPLE_SIZE = 10_000 ENGLISH_ONLY_PROPERTIES = ( - 'Sentiment', 'Subjectivity', 'Toxicity', - 'Fluency', 'Formality', 'Readability Score', - 'Unique Noun Count' + 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Readability Score', + 'Unique Noun Count', 'Count Unique Syllables', 'Sentence Length', 'Average Syllable Length' ) @@ -491,15 +610,18 @@ def _select_properties( device: Optional[str] = None, ) -> Sequence[TextProperty]: """Select properties.""" - properties = DEFAULT_PROPERTIES + all_properties = ALL_PROPERTIES + default_properties = DEFAULT_PROPERTIES if include_properties is not None and ignore_properties is not None: raise ValueError('Cannot use properties and ignore_properties parameters together.') if include_properties is not None: - properties = [prop for prop in properties if prop['name'] in include_properties] + properties = [prop for prop in all_properties if prop['name'] in include_properties] elif ignore_properties is not None: - properties = [prop for prop in properties if prop['name'] not in ignore_properties] + properties = [prop for prop in default_properties if prop['name'] not in ignore_properties] + else: + properties = default_properties if not 
include_long_calculation_properties: return [ @@ -528,7 +650,7 @@ def _select_properties( return properties -def calculate_default_properties( +def calculate_builtin_properties( raw_text: Sequence[str], include_properties: Optional[List[str]] = None, ignore_properties: Optional[List[str]] = None, @@ -543,17 +665,25 @@ def calculate_default_properties( raw_text : Sequence[str] The text to calculate the properties for. include_properties : List[str], default None - The properties to calculate. If None, all default properties will be calculated. Cannot be used together - with ignore_properties parameter. Available properties are: + The properties to calculate. If None, all default properties will be calculated. Cannot be used + together with ignore_properties parameter. Available properties are: ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count', - 'Readability Score', 'Average Sentence Length'] + 'Readability Score', 'Average Sentence Length', 'Count URLs', Count Unique URLs', 'Count Email Address', + 'Count Unique Email Address', 'Count Unique Syllables', 'Reading Time', 'Sentence Length', + 'Average Syllable Length'] + List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length', + '% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', + 'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Sentence Length'] + To calculate all the default properties, the include_properties and ignore_properties parameters should + be None. If you pass either include_properties or ignore_properties then the only the properties specified + in the list will be calculated or ignored. Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may take a long time to calculate. If include_long_calculation_properties is False, these properties will be ignored, even if they are in the include_properties parameter. ignore_properties : List[str], default None - The properties to ignore. If None, no properties will be ignored. Cannot be used together with - properties parameter. + The properties to ignore from the list of default properties. If None, no properties will be ignored and + all the default properties will be calculated. Cannot be used together with include_properties parameter. include_long_calculation_properties : bool, default False Whether to include properties that may take a long time to calculate. If False, these properties will be ignored, even if they are in the include_properties parameter. 
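To make the parameters above concrete, here is a small sketch of calling the function directly — the sample strings are invented, the property names are taken from the lists documented above, and the two-value unpacking mirrors the ``properties, properties_types`` call used in ``text_data.py`` in this patch:

from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

samples = ['Check out https://deepchecks.com for more details!',
           'A second sample with no URL at all.']

# Only the listed (non-default) properties are calculated for the two samples.
properties, properties_types = calculate_builtin_properties(
    samples,
    include_properties=['Count URLs', 'Reading Time'],
)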
diff --git a/docs/source/checks/nlp/train_test_validation/plot_property_drift.py b/docs/source/checks/nlp/train_test_validation/plot_property_drift.py index bb2359a0bd..d10775f1d1 100644 --- a/docs/source/checks/nlp/train_test_validation/plot_property_drift.py +++ b/docs/source/checks/nlp/train_test_validation/plot_property_drift.py @@ -54,8 +54,8 @@ train_dataset, test_dataset = load_data() # # Calculate properties, commented out because it takes a short while to run -# train_dataset.calculate_default_properties(include_long_calculation_properties=True) -# test_dataset.calculate_default_properties(include_long_calculation_properties=True) +# train_dataset.calculate_builtin_properties(include_long_calculation_properties=True) +# test_dataset.calculate_builtin_properties(include_long_calculation_properties=True) #%% # Run the check diff --git a/docs/source/nlp/usage_guides/nlp_properties.rst b/docs/source/nlp/usage_guides/nlp_properties.rst index f1afa8c760..c9f562529f 100644 --- a/docs/source/nlp/usage_guides/nlp_properties.rst +++ b/docs/source/nlp/usage_guides/nlp_properties.rst @@ -42,26 +42,39 @@ Deepchecks' Built-in Properties =============================== You can either use the built-in properties or implement your own ones and pass them to the relevant checks. +There are two types of built-in properties: + +#. Default properties: These properties are caclulated when you do not specify any properties to calculate or ignore. +#. Non-default properties: These properties are only caclulated when you specify them using the ``include_properties`` argument. + The built-in image properties are: -============================== ========== -Property name What is it -============================== ========== -Text Length Number of characters in the text -Average Word Length Average number of characters in a word -Max Word Length Maximum number of characters in a word -% Special Characters Percentage of special characters in the text -Language Language of the text. Uses the langdetect library -Sentiment Sentiment of the text. Uses the textblob library -Subjectivity Subjectivity of the text. Uses the textblob library -Toxicity* Toxicity of the text. Uses the unitary/toxic-bert model -Fluency* Fluency of the text. Uses the prithivida/parrot_fluency_model model -Formality* Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model -Lexical Density Percentage of unique words in the text, rounded up to 2 decimal digits -Unique Noun Count* Number of unique noun words in the text -Readability Score A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease -Average Sentence Length Average number of words per sentence in the text -============================== ========== +============================== ================ ========== +Property name Default Property Description +============================== ================ ========== +Text Length Yes Number of characters in the text +Average Word Length Yes Average number of characters in a word +Max Word Length Yes Maximum number of characters in a word +% Special Characters Yes Percentage of special characters in the text +Language Yes Language of the text. Uses the langdetect library +Sentiment Yes Sentiment of the text. Uses the textblob library +Subjectivity Yes Subjectivity of the text. Uses the textblob library +Toxicity* Yes Toxicity of the text. 
Uses the unitary/toxic-bert model +Fluency* Yes Fluency of the text. Uses the prithivida/parrot_fluency_model model +Formality* Yes Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model +Lexical Density Yes Percentage of unique words in the text, rounded up to 2 decimal digits +Unique Noun Count* Yes Number of unique noun words in the text +Readability Score Yes A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease +Average Sentence Length Yes Average number of words per sentence in the text +Count URLs No Number of URLs per text sample. +Count Unique URLs No Number of unique URLs per text sample. +Count Email Address No Number of email addresses per text sample. +Count Unique Email Address No Number of unique email addresses per text sample. +Count Unique Syllables No Number of unique syllables per text sample. +Reading Time No Time taken in seconds to read a text sample. +Sentence Length No Number of sentences per text sample. +Average Syllable Length No Average number of syllables per sentence per text sample. +============================== ================ ========== *These properties are not calculated by default, as they may take a long time to calculate. To use them, pass ``include_long_calculation_properties=True`` to the :class:`TextData.calculate_properties ` method. @@ -77,8 +90,11 @@ In order to use the properties of your text in a check, the properties should al Calculating The Built-in Properties ----------------------------------- -In order to use the built-in properties, you must call the ``calculate_default_properties`` method of the ``TextData`` +In order to use the built-in properties, you must call the ``calculate_builtin_properties`` method of the ``TextData`` object. This method will calculate the properties and add them to the :class:`TextData ` object. +To calculate all the default properties, you do not need to pass the ``include_properties`` parameter in the +``calculate_builtin_properties`` function. If you pass either ``include_properties`` or ``ignore_properties`` parameter, +then only the properties specified will be calculated or ignored. Example of calculating the built-in properties in order to use the TextPropertyOutliers check: In the following example, we will calculate the default properties in order to use the TextPropertyOutliers check: @@ -92,36 +108,39 @@ In the following example, we will calculate the default properties in order to u text_data = TextData(text) # Calculate the default properties - text_data.calculate_default_properties() + text_data.calculate_builtin_properties() # Run the check TextPropertyOutliers().run(text_data) -Note that any use of the ``TextData.calculate_default_properties`` method will override the existing properties. +Note that any use of the ``TextData.calculate_builtin_properties`` method will override the existing properties. Including or Ignoring Properties ################################# When calculating the properties, you can choose to include or exclude specific properties, by passing the -``include_properties`` or ``ignore_properties`` parameters to the ``calculate_default_properties`` method. +``include_properties`` or ``ignore_properties`` parameters to the ``calculate_builtin_properties`` method. The parameters should be a list of the names of the properties to include or ignore. Note that only one of the parameters can be passed to the method.
-In the following example, we will calculate the built-in properties and ignore the ``Text Length`` property: +In the following example, we will calculate the built-in default properties and ignore the +``Text Length`` property: .. code-block:: python - text_data.calculate_default_properties(ignore_properties=['Text Length']) + text_data.calculate_builtin_properties(ignore_properties=['Text Length']) -Moreover, some properties are not calculated by default, as they may take a long time to calculate. In order to -use them, pass ``include_long_calculation_properties`` to the ``calculate_default_properties`` method. +Note that in the example above, we specified the ``Text Length`` property to be ignored, and hence all other built-in +default properties will be calculated. Moreover, some properties are not calculated by default, as they may take a +long time to calculate. In order to use them, pass ``include_long_calculation_properties`` to the +``calculate_builtin_properties`` method. In the following example, we will calculate the properties and include only the long calculation property "Toxicity": .. code-block:: python - text_data.calculate_default_properties(include_long_calculation_properties=True, include_properties=['Toxicity']) + text_data.calculate_builtin_properties(include_long_calculation_properties=True, include_properties=['Toxicity']) Saving The Calculated Properties ################################ diff --git a/docs/source/nlp/usage_guides/text_data_object.rst b/docs/source/nlp/usage_guides/text_data_object.rst index 172453371a..ee05c2ff21 100644 --- a/docs/source/nlp/usage_guides/text_data_object.rst +++ b/docs/source/nlp/usage_guides/text_data_object.rst @@ -69,9 +69,11 @@ Useful Functions Calculate Default Properties ----------------------------- -You can calculate the default text properties for the TextData object: +To calculate all the default properties, you do not need to pass the ``include_properties`` parameter in the +``calculate_builtin_properties`` function. If you pass either ``include_properties`` or ``ignore_properties`` parameter, +then only the properties specified will be calculated or ignored. You can calculate the default text properties for the TextData object using: ->>> text_data.calculate_default_properties() +>>> text_data.calculate_builtin_properties() To learn more about how deepchecks uses properties and how you can calculate or set them yourself, see the :ref:`Text Properties Guide `.
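Tying the updated guides together, a minimal end-to-end sketch of the flow they describe (the toy text list is invented for illustration, and the ``TextPropertyOutliers`` import path is assumed to follow the same ``deepchecks.nlp.checks`` pattern used by the tests in this patch):

.. code-block:: python

    from deepchecks.nlp.text_data import TextData
    from deepchecks.nlp.checks import TextPropertyOutliers

    text = [
        'Deepchecks is a package for validating ML models and data.',
        'This second sample is a little longer, so its properties differ.',
    ]

    text_data = TextData(text)

    # All default properties except 'Text Length' are calculated and stored on the object;
    # re-running this method overrides any previously calculated properties.
    text_data.calculate_builtin_properties(ignore_properties=['Text Length'])

    # The stored properties are then used by property-based checks.
    TextPropertyOutliers().run(text_data)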
diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index f5a45657c1..b2c8fcae11 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -103,7 +103,7 @@ def test_token_classification_dataset(small_wikiann_train_test_text_data): data, _ = small_wikiann_train_test_text_data data = data.copy() data._label = np.asarray(list(data._label[:40]) + [None] * 10, dtype=object) - data.calculate_default_properties(include_long_calculation_properties=False) + data.calculate_builtin_properties(include_long_calculation_properties=False) check = UnderAnnotatedPropertySegments().add_condition_segments_relative_performance_greater_than() # Act diff --git a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py index baf4fc78db..0da9e03d55 100644 --- a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py +++ b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py @@ -10,7 +10,7 @@ # """Test for the NLP WeakSegmentsPerformance check""" import pytest -from hamcrest import assert_that, close_to, equal_to, has_items, is_in +from hamcrest import assert_that, close_to, equal_to, has_items, is_in, matches_regexp from deepchecks.nlp.checks import MetadataSegmentsPerformance, PropertySegmentsPerformance from tests.base.utils import equal_condition_result @@ -28,8 +28,10 @@ def test_tweet_emotion(tweet_emotion_train_test_textdata, tweet_emotion_train_te # Assert assert_that(condition_result, has_items( equal_condition_result(is_pass=False, - details='Found a segment with accuracy score of 0.305 in comparison to an average score of 0.708 in sampled data.', - name='The relative performance of weakest segment is greater than 80% of average model performance.') + details='Found a segment with accuracy score of 0.305 in comparison ' + 'to an average score of 0.708 in sampled data.', + name='The relative performance of weakest segment is greater than ' + '80% of average model performance.') )) assert_that(result.value['avg_score'], close_to(0.708, 0.001)) @@ -49,8 +51,10 @@ def test_tweet_emotion_properties(tweet_emotion_train_test_textdata, tweet_emoti # Assert assert_that(condition_result, has_items( equal_condition_result(is_pass=True, - details='Found a segment with accuracy score of 0.525 in comparison to an average score of 0.708 in sampled data.', - name='The relative performance of weakest segment is greater than 70% of average model performance.') + details='Found a segment with accuracy score of 0.525 in comparison to an average ' + 'score of 0.708 in sampled data.', + name='The relative performance of weakest segment is greater than 70% of average ' + 'model performance.') )) assert_that(result.value['avg_score'], close_to(0.708, 0.001)) @@ -69,10 +73,10 @@ def test_warning_of_n_top_columns(tweet_emotion_train_test_textdata, tweet_emoti 'n_top_properties to None. Alternatively, you can set parameter properties to a list of the ' \ 'specific properties you want to run on.' - metadata_warning = 'Parameter n_top_columns is set to 2 to avoid long computation time. This means that the check ' \ - 'will run on 2 metadata columns selected at random. If you want to run on all metadata columns, set ' \ - 'n_top_columns to None. 
Alternatively, you can set parameter columns to a list of the specific ' \ - 'metadata columns you want to run on.' + metadata_warning = 'Parameter n_top_columns is set to 2 to avoid long computation time. This means that the ' \ + 'check will run on 2 metadata columns selected at random. If you want to run on all metadata ' \ + 'columns, set n_top_columns to None. Alternatively, you can set parameter columns to a list ' \ + 'of the specific metadata columns you want to run on.' # Assert with pytest.warns(UserWarning, match=property_warning): @@ -91,13 +95,21 @@ def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): condition_result = check.conditions_decision(result) # Assert - assert_that(condition_result, has_items( - equal_condition_result(is_pass=True, - details='Found a segment with f1 macro score of 0.695 in comparison to an average ' - 'score of 0.83 in sampled data.', - name='The relative performance of weakest segment is greater ' - 'than 80% of average model performance.') - )) + # TODO: Check why the details is not consistent + # assert_that(condition_result, has_items( + # equal_condition_result(is_pass=True, + # details='Found a segment with f1 macro score of 0.712 in comparison to an average ' + # 'score of 0.83 in sampled data.', + # name='The relative performance of weakest segment is greater ' + # 'than 80% of average model performance.') + # )) + # TODO: Remove once details becomes consistent + pat = r'Found a segment with f1 macro score of \d+.\d+ in comparison to an average score of 0.83 in sampled data.' + assert_that(condition_result[0].details, matches_regexp(pat)) + assert_that(condition_result[0].name, equal_to('The relative performance ' + 'of weakest segment is greater ' + 'than 80% of average model ' + 'performance.')) assert_that(result.value['avg_score'], close_to(0.83, 0.001)) assert_that(len(result.value['weak_segments_list']), is_in([5, 6])) # TODO: check why it's not always 5 diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py index 9f76d97f35..81998b7473 100644 --- a/tests/nlp/checks/train_test_validation/property_drift_test.py +++ b/tests/nlp/checks/train_test_validation/property_drift_test.py @@ -50,8 +50,8 @@ def test_with_drift(self, tweet_emotion_train_test_textdata): train = train.sample(30, random_state=0) test = test.sample(30, random_state=0) - train.calculate_default_properties() - test.calculate_default_properties() + train.calculate_builtin_properties() + test.calculate_builtin_properties() check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than() @@ -95,7 +95,7 @@ class TestTokenClassification: def test_without_drift(self, small_wikiann_train_test_text_data): # Arrange train, _ = small_wikiann_train_test_text_data - train.calculate_default_properties() + train.calculate_builtin_properties() check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than() # Act result = check.run(train_dataset=train, test_dataset=train) @@ -117,10 +117,10 @@ def test_with_drift(self, small_wikiann_train_test_text_data): # Arrange train, test = small_wikiann_train_test_text_data - train.calculate_default_properties( + train.calculate_builtin_properties( include_long_calculation_properties=False ) - test.calculate_default_properties( + test.calculate_builtin_properties( include_long_calculation_properties=False ) @@ -149,7 +149,7 @@ class TestMultiLabelClassification: def test_without_drift(self, 
dummy_multilabel_textdata_train_test): # Arrange train, _ = dummy_multilabel_textdata_train_test - train.calculate_default_properties() + train.calculate_builtin_properties() check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than() # Act result = check.run(train_dataset=train, test_dataset=train) @@ -170,15 +170,12 @@ def test_without_drift(self, dummy_multilabel_textdata_train_test): def test_with_drift(self, dummy_multilabel_textdata_train_test): # Arrange train, test = dummy_multilabel_textdata_train_test - properties_to_ignore = ['Lexical Density','Unique Noun Count', 'Average Sentence Length', 'Readability Score'] - train.calculate_default_properties(ignore_properties=properties_to_ignore) - test.calculate_default_properties(ignore_properties=properties_to_ignore) - - check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than( - max_allowed_numeric_score=0.3, - max_allowed_categorical_score=0.3 - ) - + default_properties = ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', + 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', ] + train.calculate_builtin_properties(include_properties=default_properties) + test.calculate_builtin_properties(include_properties=default_properties) + check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3, + max_allowed_categorical_score=0.3) # Act result = check.run(train_dataset=train, test_dataset=test) condition_results = check.conditions_decision(result) diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index 799a0eaca7..ecc7e27800 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -16,7 +16,6 @@ from deepchecks.core.errors import DeepchecksValueError from deepchecks.nlp.text_data import TextData -from deepchecks.nlp.utils.text_properties import LONG_RUN_PROPERTIES def test_text_data_init(): @@ -184,7 +183,7 @@ def test_properties(text_classification_dataset_mock): # Act & Assert assert_that(dataset._properties, equal_to(None)) # TODO: Create test for the heavy properties - dataset.calculate_default_properties(ignore_properties=['topic', *LONG_RUN_PROPERTIES]) + dataset.calculate_builtin_properties(include_long_calculation_properties=False) properties = dataset.properties assert_that(properties.shape[0], equal_to(3)) assert_that(properties.shape[1], equal_to(10)) diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index 0eb1c9833f..3819a9ae53 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -18,14 +18,41 @@ import pytest from hamcrest import * -from deepchecks.nlp.text_data import TextData -from deepchecks.nlp.utils.text_properties import MODELS_STORAGE, calculate_default_properties, get_transformer_model +from deepchecks.nlp.utils.text_properties import MODELS_STORAGE, calculate_builtin_properties, get_transformer_model def mock_fn(*args, **kwargs): # pylint: disable=unused-argument return [0] * 20_000 +@pytest.fixture(name='manual_text_data_for_properties') +def text_data_fixture(): + """Mock data for calculating text properties.""" + text_data = { + 'url_data': [ + 'Please contact me at abc.ex@example.com.', + 'For more information, visit our website: https://deepchecks.com/.', + 'Email us at info@example.com or visit our website http://www.example.com for assistance.', + 'For any inquiries, send an email to support@example.com.', + 'The results were found at http://www.google.com and it 
redirects to' + 'https://www.deepchecks.com and there we can find the links to all social medias such' + 'as http://gmail.com, https://fb.com, https://www.deepchecks.com, https://www.google.com' + ], + 'email_data': [ + 'Please send your inquiries to info@example.com or support@example.com. We are happy to assist you.', + 'Contact us at john.doe@example.com or jane.smith@example.com for further information\ + Looking forward to hearing from you.', + 'For any questions or concerns, email sales@example.com. We are here to help.', + 'Hello, this is a text without email address@asx', + 'You can contact me directly at samantha.wilson@example.com or use the\ + team email address marketing@example.com.', + 'If you have any feedback or suggestions, feel free to email us at feedback@example.com,\ + support@example.com, feedback@example.com.' + ] + } + return text_data + + @patch('deepchecks.nlp.utils.text_properties.run_available_kwargs', mock_fn) def test_that_warning_is_shown_for_big_datasets(): # Arrange @@ -38,7 +65,7 @@ def test_that_warning_is_shown_for_big_datasets(): # Act with pytest.warns(UserWarning, match=match_text): - result = calculate_default_properties(raw_text, include_properties=['Toxicity'], + result = calculate_builtin_properties(raw_text, include_properties=['Toxicity'], include_long_calculation_properties=True)[0] # Assert @@ -52,11 +79,12 @@ def test_calculate_lexical_density_property(tweet_emotion_train_test_textdata): test_text = test.text # Act - result = calculate_default_properties(test_text, include_properties=['Lexical Density'])[0] - result_none_text = calculate_default_properties([None], include_properties=['Lexical Density'])[0] + result = calculate_builtin_properties(test_text, include_properties=['Lexical Density'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Lexical Density'])[0] # Assert - assert_that(result['Lexical Density'][0: 10], equal_to([94.44, 93.75, 100.0, 91.67, 87.5, 100.0, 100.0, 100.0, 91.67, 91.67])) + assert_that(result['Lexical Density'][0: 10], equal_to([94.44, 93.75, 100.0, 91.67, + 87.5, 100.0, 100.0, 100.0, 91.67, 91.67])) assert_that(result_none_text['Lexical Density'], equal_to([np.nan])) @@ -67,9 +95,9 @@ def test_calculate_unique_noun_count_property(tweet_emotion_train_test_textdata) test_text = test.text # Act - result = calculate_default_properties(test_text, include_properties=['Unique Noun Count'], + result = calculate_builtin_properties(test_text, include_properties=['Unique Noun Count'], include_long_calculation_properties=True)[0] - result_none_text = calculate_default_properties([None], include_properties=['Unique Noun Count'], + result_none_text = calculate_builtin_properties([None], include_properties=['Unique Noun Count'], include_long_calculation_properties=True)[0] # Assert @@ -83,8 +111,8 @@ def test_calculate_average_sentence_length_property(tweet_emotion_train_test_tex test_text = test.text # Act - result = calculate_default_properties(test_text, include_properties=['Average Sentence Length'])[0] - result_none_text = calculate_default_properties([None], include_properties=['Average Sentence Length'])[0] + result = calculate_builtin_properties(test_text, include_properties=['Average Sentence Length'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Average Sentence Length'])[0] # Assert assert_that(result['Average Sentence Length'][0: 10], equal_to([6, 7, 11, 12, 8, 19, 3, 9, 12, 7])) @@ -92,21 +120,155 @@ def 
test_calculate_average_sentence_length_property(tweet_emotion_train_test_tex def test_calculate_readability_score_property(tweet_emotion_train_test_textdata): + # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text # Act - result = calculate_default_properties(test_text, include_properties=['Readability Score'])[0] - result_none_text = calculate_default_properties([None], include_properties=['Readability Score'])[0] + result = calculate_builtin_properties(test_text, include_properties=['Readability Score'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Readability Score'])[0] # Assert - assert_that(result['Readability Score'][0: 10], equal_to([ - 102.045, 97.001, 80.306, 67.755, 77.103, 71.782, np.nan, 75.5, 70.102, 95.564 - ])) + assert_that(result['Readability Score'][0: 10], equal_to([102.045, 97.001, 80.306, 67.755, 77.103, + 71.782, np.nan, 75.5, 70.102, 95.564])) assert_that(result_none_text['Readability Score'], equal_to([np.nan])) +def test_calculate_count_unique_urls(manual_text_data_for_properties): + + # Arrange + text_data = manual_text_data_for_properties['url_data'] + + # Act + result = calculate_builtin_properties(text_data, include_properties=['Count Unique URLs'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Count Unique URLs'])[0] + + # Assert + assert_that(result['Count Unique URLs'], equal_to([0, 1, 1, 0, 5])) + assert_that(result_none_text['Count Unique URLs'], equal_to([0])) + + +def test_calculate_count_urls(manual_text_data_for_properties): + + # Arrange + text_data = manual_text_data_for_properties['url_data'] + + # Act + result = calculate_builtin_properties(text_data, include_properties=['Count URLs'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Count URLs'])[0] + + # Assert + assert_that(result['Count URLs'], equal_to([0, 1, 1, 0, 6])) + assert_that(result_none_text['Count URLs'], equal_to([0])) + + +def test_calculate_count_unique_email_addresses(manual_text_data_for_properties): + + # Arrange + text_data = manual_text_data_for_properties['email_data'] + + # Act + result = calculate_builtin_properties(text_data, include_properties=['Count Unique Email Address'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Count Unique Email Address'])[0] + + # Assert + assert_that(result['Count Unique Email Address'], equal_to([2, 2, 1, 0, 2, 2])) + assert_that(result_none_text['Count Unique Email Address'], equal_to([0])) + + +def test_calculate_count_email_addresses(manual_text_data_for_properties): + + # Arrange + text_data = manual_text_data_for_properties['email_data'] + + # Act + result = calculate_builtin_properties(text_data, include_properties=['Count Email Address'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Count Email Address'])[0] + + # Assert + assert_that(result['Count Email Address'], equal_to([2, 2, 1, 0, 2, 3])) + assert_that(result_none_text['Count Email Address'], equal_to([0])) + + +def test_calculate_count_unique_syllables(tweet_emotion_train_test_textdata): + + # Arrange + _, test = tweet_emotion_train_test_textdata + test_text = test.text + + # Act + result = calculate_builtin_properties(test_text, include_properties=['Count Unique Syllables'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Count Unique Syllables'])[0] + + # Assert + assert_that(result['Count Unique Syllables'][0: 10], equal_to([15, 11, 9, 21, 
13, 17, np.nan, 8, 20, 18])) + assert_that(result_none_text['Count Unique Syllables'], equal_to([np.nan])) + + +def test_calculate_reading_time(tweet_emotion_train_test_textdata): + + # Arrange + _, test = tweet_emotion_train_test_textdata + test_text = test.text + + # Act + result = calculate_builtin_properties(test_text, include_properties=['Reading Time'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Reading Time'])[0] + + # Assert + assert_that(result['Reading Time'][0: 10], equal_to([1.26, 1.25, 0.81, 1.35, 1.44, + 1.88, 0.48, 0.71, 1.53, 1.56])) + assert_that(result_none_text['Reading Time'], equal_to([0.00])) + + +def test_calculate_sentence_length(tweet_emotion_train_test_textdata): + + # Arrange + _, test = tweet_emotion_train_test_textdata + test_text = test.text + + # Act + result = calculate_builtin_properties(test_text, include_properties=['Sentence Length'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Sentence Length'])[0] + + # Assert + assert_that(result['Sentence Length'][0: 10], equal_to([3, 2, 1, 2, 2, 1, np.nan, 1, 2, 3])) + assert_that(result_none_text['Sentence Length'], equal_to([np.nan])) + + +def test_calculate_average_syllable_count(tweet_emotion_train_test_textdata): + + # Arrange + _, test = tweet_emotion_train_test_textdata + test_text = test.text + + # Act + result = calculate_builtin_properties(test_text, include_properties=['Average Syllable Length'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Average Syllable Length'])[0] + + # Assert + assert_that(result['Average Syllable Length'][0: 10], equal_to([7.0, 8.5, 15.0, 18.0, 11.5, + 26.0, np.nan, 13.0, 17.0, 9.0])) + assert_that(result_none_text['Average Syllable Length'], equal_to([np.nan])) + + +def test_ignore_properties(): + + # Arrange + test_text = ['This is simple sentence.'] + expected_properties = ['Text Length', 'Average Word Length', 'Max Word Length', + '% Special Characters', 'Language','Sentiment', 'Subjectivity', + 'Lexical Density', 'Readability Score', 'Average Sentence Length'] + # Act + result = calculate_builtin_properties(test_text, ignore_properties=['Unique Noun Count', + 'Toxicity', 'Fluency', + 'Formality'])[0] + # Assert + for prop in result: + assert_that(expected_properties, has_item(prop)) + + @pytest.mark.skipif( 'TEST_NLP_PROPERTIES_MODELS_DOWNLOAD' not in os.environ, reason='The test takes too long to run, provide env var if you want to run it.' @@ -181,7 +343,7 @@ def test_english_only_properties_calculation_with_not_english_samples(): 'London is the capital of Great Britain' ] # Act - properties, properties_types = calculate_default_properties( + properties, properties_types = calculate_builtin_properties( raw_text=text, include_properties=['Sentiment', 'Language', 'Text Length'] ) From fbecc3e868560cf714005a350c2e5b9cc0fdd69a Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Sun, 21 May 2023 13:16:08 +0300 Subject: [PATCH 20/20] fix reference to nlp in vision (#2545) --- docs/source/vision/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/vision/index.rst b/docs/source/vision/index.rst index 9d3ffc3a29..1116945495 100644 --- a/docs/source/vision/index.rst +++ b/docs/source/vision/index.rst @@ -29,7 +29,7 @@ This section contain tutorials for different use cases, and how to use deepcheck Usage Guides ------------ -This section contain in-depth guides on different aspects and components of the deepchecks nlp sub package. 
+This section contains in-depth guides on different aspects and components of the deepchecks vision sub package. .. toctree:: :titlesonly: @@ -45,7 +45,7 @@ This section contain in-depth guides on different aspects and components of the Checks Gallery -------------- -Below in the full list of checks and suites available in the deepchecks nlp sub package. +Below is the full list of checks and suites available in the deepchecks vision sub package. .. toctree:: :titlesonly: