Skip to content

Commit

Permalink
make release-tag: Merge branch 'master' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
amontanez24 committed Aug 23, 2023
2 parents 57a5e29 + 7001454 commit 0e1a1db
Show file tree
Hide file tree
Showing 31 changed files with 1,127 additions and 761 deletions.
23 changes: 23 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,28 @@
# Release Notes

## 1.4.0 - 2023-08-23

This release makes multiple improvements to the metadata. Both the single and multi table metadata classes now have a `validate_data` method. This method runs checks to validate the data against the current specifications in the metadata. The `SingleTableMetadata.visualize` method is also improved. The sequence index is now shown in the same section as the sequence key. It also now shows all key and index information (e.g. sequence key, primary key, sequence index) in one section.

The `CTGANSynthesizer` has been made more efficient in the following ways:
1. Boolean columns are now being skipped during `preprocess` like categorical columns are.
2. It is possible to apply other transformations to categorical columns and have `CTGAN` skip the one-hot encoding step.

Additional changes include that the columns labeled with the sdtype `id` will now go through the `IDGenerator` transformer by default and constraint transformations that were being overwritten during sampling will now be respected.

### New Features

* Add validate_data method to Metadata - Issue [#1518](https://github.com/sdv-dev/SDV/issues/1518) by @fealho
* Use IDGenerator for ID columns - Issue [#1519](https://github.com/sdv-dev/SDV/issues/1519) by @frances-h
* Metadata visualization for sequential data: Only create 2 sections - Issue [#1543](https://github.com/sdv-dev/SDV/issues/1543) by @frances-h

### Bugs Fixed

* Inefficient CTGAN modeling when adding categorical transformers - Issue [#1450](https://github.com/sdv-dev/SDV/issues/1450) by @fealho
* CTGANSynthesizer is assigning LabelEncoder to boolean columns (instead of None) - Issue [#1530](https://github.com/sdv-dev/SDV/issues/1530) by @fealho
* Metadata visualization for sequential data: Missing sequence index - Issue [#1542](https://github.com/sdv-dev/SDV/issues/1542) by @frances-h
* Constraint outputs are being overwritten in DataProcessor.reverse_transform - Issue [#1551](https://github.com/sdv-dev/SDV/issues/1551) by @amontanez24

## 1.3.0 - 2023-08-14

This release adds two new methods to the `MultiTableMetadata`: `detect_from_csvs` and `detect_from_dataframes`. These methods allow you to detect metadata for a whole dataset at once by either loading the tables from a folder or from a dictionary mapping table names to `pandas.DataFrame`s. The `SingleTableMetadata` can now be visualized! Additionally, there is now a `summarized` option in the `show_table_details` parameter of the `visualize` methods. This will print each sdtype in the table and the number of columns that have that sdtype.
Expand Down
2 changes: 1 addition & 1 deletion sdv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

__author__ = 'DataCebo, Inc.'
__email__ = 'info@sdv.dev'
__version__ = '1.3.0'
__version__ = '1.4.0.dev2'


import sys
Expand Down
7 changes: 3 additions & 4 deletions sdv/constraints/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from sdv.constraints.errors import (
AggregateConstraintsError, ConstraintMetadataError, MissingConstraintColumnError)
from sdv.errors import ConstraintsNotMetError
from sdv.utils import groupby_list
from sdv.utils import format_invalid_values_string, groupby_list

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -201,12 +201,11 @@ def _validate_data_meets_constraint(self, table_data):
if not is_valid_data.all():
constraint_data = table_data[list(self.constraint_columns)]
invalid_rows = constraint_data[~is_valid_data]
invalid_rows_str = format_invalid_values_string(invalid_rows, 5)
err_msg = (
f"Data is not valid for the '{self.__class__.__name__}' constraint:\n"
f'{invalid_rows[:5]}'
f'{invalid_rows_str}'
)
if len(invalid_rows) > 5:
err_msg += f'\n+{len(invalid_rows) - 5} more'

raise ConstraintsNotMetError(err_msg)

Expand Down
40 changes: 26 additions & 14 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
import rdt
from pandas.api.types import is_float_dtype, is_integer_dtype
from rdt.transformers import AnonymizedFaker, RegexGenerator, get_default_transformers
from rdt.transformers import AnonymizedFaker, IDGenerator, RegexGenerator, get_default_transformers

from sdv.constraints import Constraint
from sdv.constraints.base import get_subclasses
Expand Down Expand Up @@ -458,13 +458,30 @@ def _create_config(self, data, columns_created_by_constraints):

if sdtype == 'id':
is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
transformers[column] = self.create_regex_generator(
column,
sdtype,
column_metadata,
is_numeric
)
sdtypes[column] = 'text'
if column_metadata.get('regex_format', False):
transformers[column] = self.create_regex_generator(
column,
sdtype,
column_metadata,
is_numeric
)
sdtypes[column] = 'text'

elif column in self._keys:
prefix = None
if not is_numeric:
prefix = 'sdv-id-'

transformers[column] = IDGenerator(prefix=prefix)
sdtypes[column] = 'text'

else:
transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': '#####'}
)
sdtypes[column] = 'pii'

elif pii:
enforce_uniqueness = bool(column in self._keys)
Expand Down Expand Up @@ -736,12 +753,7 @@ def reverse_transform(self, data, reset_keys=False):
if column in sampled_columns
]
for column_name in sampled_columns:
if column_name in missing_columns:
column_data = anonymized_data[column_name]
elif column_name in self._keys:
column_data = generated_keys[column_name]
else:
column_data = reversed_data[column_name]
column_data = reversed_data[column_name]

dtype = self._dtypes[column_name]
if is_integer_dtype(dtype) and is_float_dtype(column_data.dtype):
Expand Down
13 changes: 13 additions & 0 deletions sdv/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,16 @@ class SamplingError(Exception):

class NonParametricError(Exception):
    """Raised to indicate that a model is not parametric."""


class InvalidDataError(Exception):
    """Raised when the provided data does not conform to the metadata.

    Args:
        errors (list):
            The individual validation error messages; kept on the instance so
            callers can inspect them programmatically.
    """

    def __init__(self, errors):
        self.errors = errors

    def __str__(self):
        # Separate the individual errors with a blank line for readability.
        details = '\n\n'.join(str(error) for error in self.errors)
        return f'The provided data does not match the metadata:\n{details}'
84 changes: 84 additions & 0 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import pandas as pd

from sdv.errors import InvalidDataError
from sdv.metadata.errors import InvalidMetadataError
from sdv.metadata.metadata_upgrader import convert_metadata
from sdv.metadata.single_table import SingleTableMetadata
Expand Down Expand Up @@ -532,6 +533,89 @@ def validate(self):
'The metadata is not valid' + '\n'.join(str(e) for e in errors)
)

def _validate_missing_tables(self, data):
"""Validate the data doesn't have all the columns in the metadata."""
errors = []
missing_tables = set(self.tables) - set(data)
if missing_tables:
errors.append(f'The provided data is missing the tables {missing_tables}.')

return errors

def _validate_all_tables(self, data):
"""Validate every table of the data has a valid table/metadata pair."""
errors = []
for table_name, table_data in data.items():
try:
self.tables[table_name].validate_data(table_data)

except InvalidDataError as error:
error_msg = f"Table: '{table_name}'"
for _error in error.errors:
error_msg += f'\nError: {_error}'

errors.append(error_msg)

except ValueError as error:
errors.append(str(error))

except KeyError:
continue

return errors

def _validate_foreign_keys(self, data):
"""Validate all foreign key relationships."""
error_msg = None
errors = []
for relation in self.relationships:
child_table = data.get(relation['child_table_name'])
parent_table = data.get(relation['parent_table_name'])

if isinstance(child_table, pd.DataFrame) and isinstance(parent_table, pd.DataFrame):
child_column = child_table[relation['child_foreign_key']]
parent_column = parent_table[relation['parent_primary_key']]
missing_values = child_column[~child_column.isin(parent_column)].unique()

if any(missing_values):
message = ', '.join(missing_values[:5].astype(str))
if len(missing_values) > 5:
message = f'({message}, + more)'
else:
message = f'({message})'

errors.append(
f"Error: foreign key column '{relation['child_foreign_key']}' contains "
f'unknown references: {message}. All the values in this column must '
'reference a primary key.'
)

if errors:
error_msg = 'Relationships:\n'
error_msg += '\n'.join(errors)

return [error_msg] if error_msg else []

def validate_data(self, data):
    """Validate that the data matches the metadata.

    Checks the following rules:
        * all tables of the metadata are present in the data
        * every table of the data satisfies its own metadata
        * all foreign keys belong to a primary key

    Args:
        data (dict):
            Mapping of table name to ``pandas.DataFrame`` to validate.
            (The previous docstring incorrectly documented this as a single
            ``pd.DataFrame``.)

    Raises:
        InvalidDataError:
            If any check fails; carries all collected error messages.
    """
    errors = []
    errors += self._validate_missing_tables(data)
    errors += self._validate_all_tables(data)
    errors += self._validate_foreign_keys(data)

    if errors:
        raise InvalidDataError(errors)

def add_table(self, table_name):
"""Add a table to the metadata.
Expand Down

0 comments on commit 0e1a1db

Please sign in to comment.