make release-tag: Merge branch 'main' into stable
amontanez24 committed Apr 19, 2024
2 parents 4290bc2 + 623a9e6 commit d347efc
Showing 23 changed files with 266 additions and 90 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/dependency_checker.yml
@@ -1,7 +1,7 @@
name: Dependency Checker
on:
schedule:
- cron: '0 0 * * 1-5'
- cron: '0 0 * * 1'
workflow_dispatch:
jobs:
build:
@@ -22,6 +22,8 @@ jobs:
with:
token: ${{ secrets.GH_ACCESS_TOKEN }}
commit-message: Update latest dependencies
author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
title: Automated Latest Dependency Updates
body: "This is an auto-generated PR with **latest** dependency updates."
branch: latest-dependency-update
2 changes: 2 additions & 0 deletions .github/workflows/static_code_analysis.yml
@@ -26,6 +26,8 @@ jobs:
with:
token: ${{ secrets.GITHUB_TOKEN }}
commit-message: Update static code analysis
author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
title: Latest Code Analysis
body: "This is an auto-generated PR with the **latest** code analysis results."
branch: static-code-analysis
20 changes: 20 additions & 0 deletions HISTORY.md
@@ -1,5 +1,25 @@
# Release Notes

## 1.12.1 - 2024-04-19

This release makes a number of changes to how ID columns are generated. By default, ID columns with a regex will now have their values scrambled in the output. ID columns without a regex that are numeric will be created randomly; if they are not numeric, they will have a random suffix.

Additionally, improvements were made to the visibility of the `get_loss_values_plot` output.
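
Not part of this commit — a minimal sketch of how the new id behavior might be exercised, assuming the standard `SingleTableMetadata` / `GaussianCopulaSynthesizer` API; the column names, regex, and data below are illustrative:

```python
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

# Illustrative data: 'user_id' is an id column backed by a regex, so its
# sampled values should come back scrambled rather than in sequential order.
data = pd.DataFrame({
    'user_id': ['U-0001', 'U-0002', 'U-0003', 'U-0004'],
    'age': [25, 31, 47, 52],
})

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)
metadata.update_column('user_id', sdtype='id', regex_format='U-[0-9]{4}')
metadata.set_primary_key('user_id')

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)
print(synthesizer.sample(num_rows=4))
```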

### New Features

* Create unique id for each synthesizer - Issue [#1902](https://github.com/sdv-dev/SDV/issues/1902) by @pvk-developer
* Generator Discriminator Loss Chart Color Change - Issue [#1916](https://github.com/sdv-dev/SDV/issues/1916) by @lajohn4747
* If using regex to generate values, scramble them - Issue [#1921](https://github.com/sdv-dev/SDV/issues/1921) by @amontanez24
* When generating ids without a regex, create them randomly - Issue [#1922](https://github.com/sdv-dev/SDV/issues/1922) by @frances-h

### Maintenance

* Cleanup automated PR workflows - Issue [#1926](https://github.com/sdv-dev/SDV/issues/1926) by @R-Palazzo

### Internal

* Add add-on modules to sys.modules - Issue [#1924](https://github.com/sdv-dev/SDV/issues/1924) by @amontanez24

## 1.12.0 - 2024-04-16

This release adds support for Python 3.12! It also adds a number of feature improvements. It adds a `simplify_schema` utility function to the `sdv.utils.poc` module which simplifies multi-table schemas so they can be run using `HMASynthesizer`. Multi-table data dictionaries can now be saved directly to CSVs using the `sdv.datasets.local.save_csvs` utility function. Additionally, generator-discriminator loss values can now be plotted directly from CTGAN using the `get_loss_values_plot` method. This release also adds error messages when trying to load an SDV synthesizer on an older version of the SDV, or when trying to re-fit a synthesizer from an older version of the SDV.
2 changes: 1 addition & 1 deletion Makefile
@@ -250,7 +250,7 @@ release: check-release bumpversion-release publish bumpversion-patch
release-test: check-release bumpversion-release-test publish-test bumpversion-revert

.PHONY: release-candidate
release-candidate: check-main publish bumpversion-candidate
release-candidate: check-main publish bumpversion-candidate git-push

.PHONY: release-candidate-test
release-candidate-test: check-clean check-main publish-test
2 changes: 1 addition & 1 deletion latest_requirements.txt
@@ -5,6 +5,6 @@ deepecho==0.6.0
graphviz==0.20.3
numpy==1.26.4
pandas==2.2.2
rdt==1.11.0
rdt==1.11.1
sdmetrics==0.14.0
tqdm==4.66.2
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -36,7 +36,7 @@ dependencies = [
'copulas>=0.11.0',
'ctgan>=0.10.0',
'deepecho>=0.6.0',
'rdt>=1.11.0',
'rdt>=1.12.0',
'sdmetrics>=0.14.0',
]

@@ -96,12 +96,12 @@ dev = [
'flake8-mutable>=1.2.0,<1.3',
'flake8-expression-complexity>=0.0.9,<0.1',
'flake8-print>=4.0.0,<4.1',
'flake8-pytest-style>=1.5.0,<3',
'flake8-pytest-style>=2.0.0,<3',
'flake8-quotes>=3.3.0,<4',
'flake8-sfs>=0.0.3,<2',
'flake8-variables-names>=0.0.4,<0.1',
'dlint>=0.11.0,<1',
'isort>=4.3.4,<6',
'isort>=5.13.2,<6',
'pandas-vet>=0.2.3,<2024',
'pep8-naming>=0.12.1,<1',
'pydocstyle>=6.1.1,<7',
@@ -154,7 +154,7 @@ namespaces = false
version = {attr = 'sdv.__version__'}

[tool.bumpversion]
current_version = "1.12.0"
current_version = "1.12.1.dev2"
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
serialize = [
'{major}.{minor}.{patch}.{release}{candidate}',
13 changes: 10 additions & 3 deletions sdv/__init__.py
@@ -6,23 +6,25 @@

__author__ = 'DataCebo, Inc.'
__email__ = 'info@sdv.dev'
__version__ = '1.12.0'
__version__ = '1.12.1.dev2'


import sys
import warnings
from importlib.metadata import entry_points
from operator import attrgetter
from types import ModuleType

from sdv import (
constraints, data_processing, datasets, evaluation, lite, metadata, metrics, multi_table,
constraints, data_processing, datasets, evaluation, io, lite, metadata, metrics, multi_table,
sampling, sequential, single_table, version)

__all__ = [
'constraints',
'data_processing',
'datasets',
'evaluation',
'io',
'lite',
'metadata',
'metrics',
@@ -92,7 +94,7 @@ def _find_addons():
addon = entry_point.load()
except Exception as e: # pylint: disable=broad-exception-caught
msg = (
f'Failed to load "{entry_point.name}" from "{entry_point.module_name}" '
f'Failed to load "{entry_point.name}" from "{entry_point.version}" '
f'with error:\n{e}'
)
warnings.warn(msg)
@@ -105,6 +107,11 @@ def _find_addons():
warnings.warn(msg)
continue

if isinstance(addon, ModuleType):
addon_module_name = f'{addon_target.__name__}.{addon_name}'
if addon_module_name not in sys.modules:
sys.modules[addon_module_name] = addon

setattr(addon_target, addon_name, addon)


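Not part of the commit — a hedged sketch of what the new `sys.modules` registration in `_find_addons` enables (the add-on name `my_addon` is invented): once an add-on module is registered under a dotted name, it resolves through the normal import machinery like a regular submodule.

```python
import importlib
import sys
import types

import sdv

# 'my_addon' is a stand-in for an add-on module loaded from an entry point.
fake_addon = types.ModuleType('sdv.my_addon')
sys.modules['sdv.my_addon'] = fake_addon   # what the new code adds
setattr(sdv, 'my_addon', fake_addon)       # what _find_addons already did

# The dotted name now resolves like any other submodule import.
assert importlib.import_module('sdv.my_addon') is fake_addon
```
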
21 changes: 21 additions & 0 deletions sdv/_utils.py
@@ -1,5 +1,6 @@
"""Miscellaneous utility functions."""
import operator
import uuid
import warnings
from collections import defaultdict
from collections.abc import Iterable
@@ -388,3 +389,23 @@ def _get_root_tables(relationships):
parent_tables = {rel['parent_table_name'] for rel in relationships}
child_tables = {rel['child_table_name'] for rel in relationships}
return parent_tables - child_tables


def generate_synthesizer_id(synthesizer):
"""Generate a unique identifier for the synthesizer instance.
This method creates a unique identifier by combining the class name, the public SDV version
and the last part of a UUID4 composed by 36 random characters.
Args:
synthesizer (BaseSynthesizer or BaseMultiTableSynthesizer):
An SDV model instance to check versions against.
Returns:
ID:
A unique identifier for this synthesizer.
"""
class_name = synthesizer.__class__.__name__
synth_version = version.public
unique_id = ''.join(str(uuid.uuid4()).split('-'))
return f'{class_name}_{synth_version}_{unique_id}'
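
For illustration only — a standalone sketch of the identifier format the function above produces (the class name and version are placeholders):

```python
import uuid

class_name = 'GaussianCopulaSynthesizer'   # placeholder synthesizer class name
synth_version = '1.12.1'                   # placeholder public SDV version
unique_id = ''.join(str(uuid.uuid4()).split('-'))

# e.g. 'GaussianCopulaSynthesizer_1.12.1_9bf0a3f2c6e44d1b8a7c3d2e1f0a9b8c'
print(f'{class_name}_{synth_version}_{unique_id}')
```
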
25 changes: 14 additions & 11 deletions sdv/data_processing/data_processor.py
@@ -9,7 +9,7 @@
import pandas as pd
import rdt
from pandas.api.types import is_float_dtype, is_integer_dtype
from rdt.transformers import AnonymizedFaker, IDGenerator, get_default_transformers
from rdt.transformers import AnonymizedFaker, get_default_transformers
from rdt.transformers.pii.anonymization import get_anonymized_transformer

from sdv.constraints import Constraint
@@ -481,7 +481,8 @@ def create_regex_generator(self, column_name, sdtype, column_metadata, is_numeri
regex_format = column_metadata.get('regex_format', default_regex_format)
transformer = rdt.transformers.RegexGenerator(
regex_format=regex_format,
enforce_uniqueness=(column_name in self._keys)
enforce_uniqueness=(column_name in self._keys),
generation_order='scrambled'
)

return transformer
@@ -567,21 +568,23 @@ def _create_config(self, data, columns_created_by_constraints):
)
sdtypes[column] = 'text'

elif column in self._keys:
prefix = None
if not is_numeric:
prefix = 'sdv-id-'
else:
bothify_format = 'sdv-id-??????'
if is_numeric:
bothify_format = '#########'

transformers[column] = IDGenerator(prefix=prefix)
sdtypes[column] = 'text'
cardinality_rule = None
if column in self._keys:
cardinality_rule = 'unique'

else:
transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': '#####'}
function_kwargs={'text': bothify_format},
cardinality_rule=cardinality_rule
)
sdtypes[column] = 'pii'

sdtypes[column] = 'pii' if column_metadata.get('pii') else 'text'

elif sdtype == 'unknown':
sdtypes[column] = 'pii'
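
Not part of the diff — a hedged sketch of the transformer configuration used above, assuming an RDT version that supports `generation_order` and `cardinality_rule`; the column name and regex are illustrative:

```python
import pandas as pd
from rdt.transformers import AnonymizedFaker, RegexGenerator

# Regex-backed key column: values are generated from the regex, kept unique,
# and returned in scrambled order (mirroring create_regex_generator above).
regex_transformer = RegexGenerator(
    regex_format='id-[0-9]{3}',
    enforce_uniqueness=True,
    generation_order='scrambled',
)

# Key column without a regex: values come from Faker's bothify with a numeric
# (or 'sdv-id-??????') pattern and a 'unique' cardinality rule, as in the
# _create_config branch above. Shown here only to illustrate the configuration.
faker_transformer = AnonymizedFaker(
    provider_name=None,
    function_name='bothify',
    function_kwargs={'text': '#########'},
    cardinality_rule='unique',
)

data = pd.DataFrame({'user_id': ['id-001', 'id-002', 'id-003']})
regex_transformer.fit(data, column='user_id')
print(regex_transformer.reverse_transform(regex_transformer.transform(data)))
```
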
1 change: 1 addition & 0 deletions sdv/io/__init__.py
@@ -0,0 +1 @@
"""I/O module."""
7 changes: 6 additions & 1 deletion sdv/multi_table/base.py
@@ -13,7 +13,8 @@

from sdv import version
from sdv._utils import (
_validate_foreign_keys_not_null, check_sdv_versions_and_warn, check_synthesizer_version)
_validate_foreign_keys_not_null, check_sdv_versions_and_warn, check_synthesizer_version,
generate_synthesizer_id)
from sdv.errors import ConstraintsNotMetError, InvalidDataError, SynthesizerInputError
from sdv.single_table.copulas import GaussianCopulaSynthesizer

Expand Down Expand Up @@ -111,6 +112,7 @@ def __init__(self, metadata, locales=['en_US'], synthesizer_kwargs=None):
self._fitted_date = None
self._fitted_sdv_version = None
self._fitted_sdv_enterprise_version = None
self._synthesizer_id = generate_synthesizer_id(self)

def _get_root_parents(self):
"""Get the set of root parents in the graph."""
@@ -604,4 +606,7 @@ def load(cls, filepath):

check_synthesizer_version(synthesizer)
check_sdv_versions_and_warn(synthesizer)
if getattr(synthesizer, '_synthesizer_id', None) is None:
synthesizer._synthesizer_id = generate_synthesizer_id(synthesizer)

return synthesizer
7 changes: 6 additions & 1 deletion sdv/single_table/base.py
@@ -19,7 +19,8 @@
from copulas.multivariate import GaussianMultivariate

from sdv import version
from sdv._utils import _groupby_list, check_sdv_versions_and_warn, check_synthesizer_version
from sdv._utils import (
_groupby_list, check_sdv_versions_and_warn, check_synthesizer_version, generate_synthesizer_id)
from sdv.constraints.errors import AggregateConstraintsError
from sdv.data_processing.data_processor import DataProcessor
from sdv.errors import ConstraintsNotMetError, InvalidDataError, SynthesizerInputError
@@ -105,6 +106,7 @@ def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True,
self._fitted_date = None
self._fitted_sdv_version = None
self._fitted_sdv_enterprise_version = None
self._synthesizer_id = generate_synthesizer_id(self)

def set_address_columns(self, column_names, anonymization_level='full'):
"""Set the address multi-column transformer."""
@@ -438,6 +440,9 @@ def load(cls, filepath):

check_synthesizer_version(synthesizer)
check_sdv_versions_and_warn(synthesizer)
if getattr(synthesizer, '_synthesizer_id', None) is None:
synthesizer._synthesizer_id = generate_synthesizer_id(synthesizer)

return synthesizer


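Not part of the commit — a hedged round-trip sketch of the `load()` backfill above: any loaded synthesizer ends up with a `_synthesizer_id`, even if it was saved by an SDV version that predates the attribute. The data, metadata, and file name are illustrative.

```python
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

data = pd.DataFrame({'amount': [1.0, 2.5, 3.7, 4.2]})
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.save('demo_synthesizer.pkl')

loaded = GaussianCopulaSynthesizer.load('demo_synthesizer.pkl')
print(loaded._synthesizer_id)  # always populated after load()
```
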
4 changes: 1 addition & 3 deletions sdv/single_table/ctgan.py
@@ -71,15 +71,13 @@ def get_loss_values_plot(self, title='CTGAN loss function'):

# Tidy up the loss values data
loss_df = self._model.loss_values.copy()
loss_df['Generator Loss'] = loss_df['Generator Loss'].apply(lambda x: x.item())
loss_df['Discriminator Loss'] = loss_df['Discriminator Loss'].apply(lambda x: x.item())

# Create a pretty chart using Plotly Express
fig = px.line(
loss_df, x='Epoch',
y=['Generator Loss', 'Discriminator Loss'],
color_discrete_map={
'Generator Loss': visualization.PlotConfig.DATACEBO_BLUE,
'Generator Loss': visualization.PlotConfig.DATACEBO_DARK,
'Discriminator Loss': visualization.PlotConfig.DATACEBO_GREEN
},
)
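
Not part of the diff — a usage sketch for `get_loss_values_plot` with the updated colors, assuming a fitted `CTGANSynthesizer`; the data here is invented and tiny, just enough to train for a few epochs.

```python
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

real_data = pd.DataFrame({'amount': [10.5, 20.0, 15.2, 30.1] * 150})
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_data)

synthesizer = CTGANSynthesizer(metadata, epochs=10)
synthesizer.fit(real_data)

fig = synthesizer.get_loss_values_plot()  # generator loss now uses DATACEBO_DARK
fig.show()
```
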
36 changes: 18 additions & 18 deletions static_code_analysis.txt
@@ -1,15 +1,15 @@
Run started:2024-04-03 21:26:55.293630
Run started:2024-04-16 22:26:14.110085

Test results:
>> Issue: [B110:try_except_pass] Try, Except, Pass detected.
Severity: Low Confidence: High
CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html)
More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b110_try_except_pass.html
Location: ./sdv/_utils.py:379:8
378 return True
379 except Exception:
380 pass
381
Location: ./sdv/_utils.py:320:8
319
320 except Exception:
321 pass
322

--------------------------------------------------
>> Issue: [B105:hardcoded_password_string] Possible hardcoded password: '#'
@@ -36,28 +36,28 @@ Test results:
Severity: Low Confidence: High
CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html)
More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b110_try_except_pass.html
Location: ./sdv/metadata/single_table.py:512:12
511
512 except Exception:
513 pass
514
Location: ./sdv/metadata/single_table.py:513:12
512
513 except Exception:
514 pass
515

--------------------------------------------------
>> Issue: [B110:try_except_pass] Try, Except, Pass detected.
Severity: Low Confidence: High
CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html)
More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b110_try_except_pass.html
Location: ./sdv/multi_table/hma.py:298:12
297 index.append(foreign_key_value)
298 except Exception:
299 # Skip children rows subsets that fail
300 pass
301
Location: ./sdv/multi_table/hma.py:336:12
335 index.append(foreign_key_value)
336 except Exception:
337 # Skip children rows subsets that fail
338 pass
339

--------------------------------------------------

Code scanned:
Total lines of code: 10421
Total lines of code: 10878
Total lines skipped (#nosec): 0
Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0

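For context (not from the SDV codebase), a minimal example of the pattern behind the B110 findings above: a `try`/`except` that silently swallows every exception.

```python
def risky_operation():
    raise ValueError('boom')

try:
    risky_operation()
except Exception:  # bandit B110: try_except_pass
    pass
```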
