Skip to content

Commit

Permalink
make release-tag: Merge branch 'master' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
amontanez24 committed Aug 23, 2023
2 parents 57a5e29 + 7001454 commit 0e1a1db
Show file tree
Hide file tree
Showing 31 changed files with 1,127 additions and 761 deletions.
23 changes: 23 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,28 @@
# Release Notes

## 1.4.0 - 2023-08-23

This release makes multiple improvements to the metadata. Both the single and multi table metadata classes now have a `validate_data` method. This method runs checks to validate the data against the current specifications in the metadata. The `SingleTableMetadata.visualize` method is also improved. The sequence index is now shown in the same section as the sequence key. It also now shows all key and index information (e.g. sequence key, primary key, sequence index) in one section.

The `CTGANSynthesizer` has been made more efficient in the following ways:
1. Boolean columns are now being skipped during `preprocess` like categorical columns are.
2. It is possible to apply other transformations to categorical columns and have `CTGAN` skip the one-hot encoding step.

Additional changes include that the columns labeled with the sdtype `id` will now go through the `IDGenerator` transformer by default and constraint transformations that were being overwritten during sampling will now be respected.

### New Features

* Add validate_data method to Metadata - Issue [#1518](https://github.com/sdv-dev/SDV/issues/1518) by @fealho
* Use IDGenerator for ID columns - Issue [#1519](https://github.com/sdv-dev/SDV/issues/1519) by @frances-h
* Metadata visualization for sequential data: Only create 2 sections - Issue [#1543](https://github.com/sdv-dev/SDV/issues/1543) by @frances-h

### Bugs Fixed

* Inefficient CTGAN modeling when adding categorical transformers - Issue [#1450](https://github.com/sdv-dev/SDV/issues/1450) by @fealho
* CTGANSynthesizer is assigning LabelEncoder to boolean columns (instead of None) - Issue [#1530](https://github.com/sdv-dev/SDV/issues/1530) by @fealho
* Metadata visualization for sequential data: Missing sequence index - Issue [#1542](https://github.com/sdv-dev/SDV/issues/1542) by @frances-h
* Constraint outputs are being overwritten in DataProcessor.reverse_transform - Issue [#1551](https://github.com/sdv-dev/SDV/issues/1551) by @amontanez24

## 1.3.0 - 2023-08-14

This release adds two new methods to the `MultiTableMetadata`: `detect_from_csvs` and `detect_from_dataframes`. These methods allow you to detect metadata for a whole dataset at once by either loading the tables from a folder or from a dictionary mapping table names to `pandas.DataFrame`s. The `SingleTableMetadata` can now be visualized! Additionally, there is now a `summarized` option in the `show_table_details` parameter of the `visualize` methods. This will print each sdtype in the table and the number of columns that have that sdtype.
Expand Down
2 changes: 1 addition & 1 deletion sdv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

__author__ = 'DataCebo, Inc.'
__email__ = 'info@sdv.dev'
__version__ = '1.3.0'
__version__ = '1.4.0.dev2'


import sys
Expand Down
7 changes: 3 additions & 4 deletions sdv/constraints/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from sdv.constraints.errors import (
AggregateConstraintsError, ConstraintMetadataError, MissingConstraintColumnError)
from sdv.errors import ConstraintsNotMetError
from sdv.utils import groupby_list
from sdv.utils import format_invalid_values_string, groupby_list

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -201,12 +201,11 @@ def _validate_data_meets_constraint(self, table_data):
if not is_valid_data.all():
constraint_data = table_data[list(self.constraint_columns)]
invalid_rows = constraint_data[~is_valid_data]
invalid_rows_str = format_invalid_values_string(invalid_rows, 5)
err_msg = (
f"Data is not valid for the '{self.__class__.__name__}' constraint:\n"
f'{invalid_rows[:5]}'
f'{invalid_rows_str}'
)
if len(invalid_rows) > 5:
err_msg += f'\n+{len(invalid_rows) - 5} more'

raise ConstraintsNotMetError(err_msg)

Expand Down
40 changes: 26 additions & 14 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
import rdt
from pandas.api.types import is_float_dtype, is_integer_dtype
from rdt.transformers import AnonymizedFaker, RegexGenerator, get_default_transformers
from rdt.transformers import AnonymizedFaker, IDGenerator, RegexGenerator, get_default_transformers

from sdv.constraints import Constraint
from sdv.constraints.base import get_subclasses
Expand Down Expand Up @@ -458,13 +458,30 @@ def _create_config(self, data, columns_created_by_constraints):

if sdtype == 'id':
is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
transformers[column] = self.create_regex_generator(
column,
sdtype,
column_metadata,
is_numeric
)
sdtypes[column] = 'text'
if column_metadata.get('regex_format', False):
transformers[column] = self.create_regex_generator(
column,
sdtype,
column_metadata,
is_numeric
)
sdtypes[column] = 'text'

elif column in self._keys:
prefix = None
if not is_numeric:
prefix = 'sdv-id-'

transformers[column] = IDGenerator(prefix=prefix)
sdtypes[column] = 'text'

else:
transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': '#####'}
)
sdtypes[column] = 'pii'

elif pii:
enforce_uniqueness = bool(column in self._keys)
Expand Down Expand Up @@ -736,12 +753,7 @@ def reverse_transform(self, data, reset_keys=False):
if column in sampled_columns
]
for column_name in sampled_columns:
if column_name in missing_columns:
column_data = anonymized_data[column_name]
elif column_name in self._keys:
column_data = generated_keys[column_name]
else:
column_data = reversed_data[column_name]
column_data = reversed_data[column_name]

dtype = self._dtypes[column_name]
if is_integer_dtype(dtype) and is_float_dtype(column_data.dtype):
Expand Down
13 changes: 13 additions & 0 deletions sdv/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,16 @@ class SamplingError(Exception):

class NonParametricError(Exception):
    """Raised to indicate that a model is not parametric."""


class InvalidDataError(Exception):
    """Raised when the provided data does not conform to the metadata.

    Args:
        errors (list):
            The individual validation error messages; kept on the instance so
            callers can inspect them programmatically.
    """

    def __init__(self, errors):
        self.errors = errors

    def __str__(self):
        # Separate the individual errors with a blank line for readability.
        details = '\n\n'.join(str(error) for error in self.errors)
        return f'The provided data does not match the metadata:\n{details}'
84 changes: 84 additions & 0 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import pandas as pd

from sdv.errors import InvalidDataError
from sdv.metadata.errors import InvalidMetadataError
from sdv.metadata.metadata_upgrader import convert_metadata
from sdv.metadata.single_table import SingleTableMetadata
Expand Down Expand Up @@ -532,6 +533,89 @@ def validate(self):
'The metadata is not valid' + '\n'.join(str(e) for e in errors)
)

def _validate_missing_tables(self, data):
"""Validate the data doesn't have all the columns in the metadata."""
errors = []
missing_tables = set(self.tables) - set(data)
if missing_tables:
errors.append(f'The provided data is missing the tables {missing_tables}.')

return errors

def _validate_all_tables(self, data):
"""Validate every table of the data has a valid table/metadata pair."""
errors = []
for table_name, table_data in data.items():
try:
self.tables[table_name].validate_data(table_data)

except InvalidDataError as error:
error_msg = f"Table: '{table_name}'"
for _error in error.errors:
error_msg += f'\nError: {_error}'

errors.append(error_msg)

except ValueError as error:
errors.append(str(error))

except KeyError:
continue

return errors

def _validate_foreign_keys(self, data):
"""Validate all foreign key relationships."""
error_msg = None
errors = []
for relation in self.relationships:
child_table = data.get(relation['child_table_name'])
parent_table = data.get(relation['parent_table_name'])

if isinstance(child_table, pd.DataFrame) and isinstance(parent_table, pd.DataFrame):
child_column = child_table[relation['child_foreign_key']]
parent_column = parent_table[relation['parent_primary_key']]
missing_values = child_column[~child_column.isin(parent_column)].unique()

if any(missing_values):
message = ', '.join(missing_values[:5].astype(str))
if len(missing_values) > 5:
message = f'({message}, + more)'
else:
message = f'({message})'

errors.append(
f"Error: foreign key column '{relation['child_foreign_key']}' contains "
f'unknown references: {message}. All the values in this column must '
'reference a primary key.'
)

if errors:
error_msg = 'Relationships:\n'
error_msg += '\n'.join(errors)

return [error_msg] if error_msg else []

def validate_data(self, data):
    """Validate that the data matches the metadata.

    Checks the following rules:
        * all tables of the metadata are present in the data
        * every table of the data satisfies its own metadata
        * all foreign keys belong to a primary key

    Args:
        data (dict):
            Mapping of table name to ``pandas.DataFrame`` to validate.
            (The previous docstring incorrectly documented this as a single
            ``pd.DataFrame``.)

    Raises:
        InvalidDataError:
            If any check fails; carries all collected error messages.
    """
    errors = []
    errors += self._validate_missing_tables(data)
    errors += self._validate_all_tables(data)
    errors += self._validate_foreign_keys(data)

    if errors:
        raise InvalidDataError(errors)

def add_table(self, table_name):
"""Add a table to the metadata.
Expand Down

0 comments on commit 0e1a1db

Please sign in to comment.