Update logs to write to CSV
frances-h committed May 14, 2024
1 parent 4d67b9b commit 00cea6d
Showing 8 changed files with 243 additions and 247 deletions.
4 changes: 3 additions & 1 deletion sdv/logging/__init__.py
@@ -1,10 +1,12 @@
 """Module for configuring loggers within the SDV library."""
 
 from sdv.logging.logger import get_sdv_logger
-from sdv.logging.utils import disable_single_table_logger, get_sdv_logger_config
+from sdv.logging.utils import (
+    disable_single_table_logger, get_sdv_logger_config, load_logfile_dataframe)
 
 __all__ = (
     'disable_single_table_logger',
     'get_sdv_logger',
     'get_sdv_logger_config',
+    'load_logfile_dataframe'
 )
8 changes: 4 additions & 4 deletions sdv/logging/sdv_logger_config.yml
@@ -6,22 +6,22 @@ loggers:
     propagate: false
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
   MultiTableSynthesizer:
     level: INFO
     propagate: false
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
   MultiTableMetadata:
     level: INFO
     propagate: false
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
   SingleTableMetadata:
     level: INFO
     propagate: false
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
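
Renaming the target files alone does not make the output CSV; the serialization has to happen in the handler setup in sdv/logging/logger.py, which is among the 8 changed files but not shown in this excerpt. A minimal sketch of how a logging.Formatter could turn the dict records introduced below into rows matching load_logfile_dataframe's column order; the class name and wiring here are assumptions, not SDV's actual code:

    import csv
    import io
    import logging

    class CSVFormatter(logging.Formatter):
        """Hypothetical formatter: one CSV row per dict log record."""

        FIELDS = [
            'EVENT', 'TIMESTAMP', 'SYNTHESIZER CLASS NAME', 'SYNTHESIZER ID',
            'TOTAL NUMBER OF TABLES', 'TOTAL NUMBER OF ROWS',
            'TOTAL NUMBER OF COLUMNS',
        ]

        def format(self, record):
            # LEVEL comes first, matching the column headers in sdv/logging/utils.py.
            row = [record.levelname] + [record.msg.get(field, '') for field in self.FIELDS]
            output = io.StringIO()
            csv.writer(output).writerow(row)
            return output.getvalue().strip()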
15 changes: 15 additions & 0 deletions sdv/logging/utils.py
@@ -5,6 +5,7 @@
 import shutil
 from pathlib import Path
 
+import pandas as pd
 import platformdirs
 import yaml
 
@@ -49,3 +50,17 @@ def disable_single_table_logger():
     finally:
         for handler in handlers:
             single_table_logger.addHandler(handler)
+
+
+def load_logfile_dataframe(logfile):
+    """Load the SDV logfile as a pandas DataFrame with correct column headers.
+
+    Args:
+        logfile (str):
+            Path to the SDV log CSV file.
+    """
+    column_names = [
+        'LEVEL', 'EVENT', 'TIMESTAMP', 'SYNTHESIZER CLASS NAME', 'SYNTHESIZER ID',
+        'TOTAL NUMBER OF TABLES', 'TOTAL NUMBER OF ROWS', 'TOTAL NUMBER OF COLUMNS'
+    ]
+    return pd.read_csv(logfile, names=column_names)
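
As a quick sanity check, the new helper reads the CSV log back with the expected headers. A minimal usage sketch, assuming a synthesizer has already written events to the default sdv_logs.csv in the working directory:

    from sdv.logging import load_logfile_dataframe

    # Filename is an assumption based on the default in sdv_logger_config.yml above.
    logs = load_logfile_dataframe('sdv_logs.csv')
    print(logs[['EVENT', 'TIMESTAMP', 'SYNTHESIZER CLASS NAME']].head())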
125 changes: 50 additions & 75 deletions sdv/multi_table/base.py
@@ -119,15 +119,12 @@ def __init__(self, metadata, locales=['en_US'], synthesizer_kwargs=None):
         self._fitted_sdv_version = None
         self._fitted_sdv_enterprise_version = None
         self._synthesizer_id = generate_synthesizer_id(self)
-        SYNTHESIZER_LOGGER.info(
-            '\nInstance:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            self._synthesizer_id
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Instance',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id
+        })
 
     def set_address_columns(self, table_name, column_names, anonymization_level='full'):
         """Set the address multi-column transformer.
@@ -403,22 +400,16 @@ def fit_processed_data(self, processed_data):
             total_rows += len(table)
             total_columns += len(table.columns)
 
-        SYNTHESIZER_LOGGER.info(
-            '\nFit processed data:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Statistics of the fit processed data:\n'
-            '    Total number of tables: %s\n'
-            '    Total number of rows: %s\n'
-            '    Total number of columns: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(processed_data),
-            total_rows,
-            total_columns,
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Fit processed data',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': len(processed_data),
+            'TOTAL NUMBER OF ROWS': total_rows,
+            'TOTAL NUMBER OF COLUMNS': total_columns
+        })
 
         check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt)
         with disable_single_table_logger():
             augmented_data = self._augment_tables(processed_data)
@@ -443,22 +434,16 @@ def fit(self, data):
             total_rows += len(table)
             total_columns += len(table.columns)
 
-        SYNTHESIZER_LOGGER.info(
-            '\nFit:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Statistics of the fit data:\n'
-            '    Total number of tables: %s\n'
-            '    Total number of rows: %s\n'
-            '    Total number of columns: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(data),
-            total_rows,
-            total_columns,
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Fit',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': len(data),
+            'TOTAL NUMBER OF ROWS': total_rows,
+            'TOTAL NUMBER OF COLUMNS': total_columns
+        })
 
         check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt)
         _validate_foreign_keys_not_null(self.metadata, data)
         self._check_metadata_updated()
@@ -511,22 +496,16 @@ def sample(self, scale=1.0):
             if table in table_columns:
                 sampled_data[table].columns = table_columns[table]
 
-        SYNTHESIZER_LOGGER.info(
-            '\nSample:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Statistics of the sample size:\n'
-            '    Total number of tables: %s\n'
-            '    Total number of rows: %s\n'
-            '    Total number of columns: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(sampled_data),
-            total_rows,
-            total_columns,
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Sample',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': len(sampled_data),
+            'TOTAL NUMBER OF ROWS': total_rows,
+            'TOTAL NUMBER OF COLUMNS': total_columns
+        })
 
         return sampled_data
 
     def get_learned_distributions(self, table_name):
@@ -692,15 +671,13 @@ def save(self, filepath):
                 Path where the instance will be serialized.
         """
         synthesizer_id = getattr(self, '_synthesizer_id', None)
-        SYNTHESIZER_LOGGER.info(
-            '\nSave:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            synthesizer_id
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Save',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': synthesizer_id,
+        })
 
         with open(filepath, 'wb') as output:
             cloudpickle.dump(self, output)
 
@@ -724,13 +701,11 @@ def load(cls, filepath):
         if getattr(synthesizer, '_synthesizer_id', None) is None:
             synthesizer._synthesizer_id = generate_synthesizer_id(synthesizer)
 
-        SYNTHESIZER_LOGGER.info(
-            '\nLoad:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            synthesizer.__class__.__name__,
-            synthesizer._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Load',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': synthesizer.__class__.__name__,
+            'SYNTHESIZER ID': synthesizer._synthesizer_id,
+        })
 
         return synthesizer
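
Taken together, a multi-table synthesizer now appends one CSV row per lifecycle event. A hedged end-to-end sketch; the demo dataset name and the default log location are assumptions, not part of this commit:

    from sdv.datasets.demo import download_demo
    from sdv.logging import load_logfile_dataframe
    from sdv.multi_table import HMASynthesizer

    data, metadata = download_demo(modality='multi_table', dataset_name='fake_hotels')
    synthesizer = HMASynthesizer(metadata)   # logs an 'Instance' event
    synthesizer.fit(data)                    # logs 'Fit' and 'Fit processed data'
    synthesizer.sample(scale=1.0)            # logs a 'Sample' event

    logs = load_logfile_dataframe('sdv_logs.csv')
    print(logs['EVENT'].tolist())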
118 changes: 46 additions & 72 deletions sdv/single_table/base.py
@@ -112,15 +112,12 @@ def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True,
         self._fitted_sdv_version = None
         self._fitted_sdv_enterprise_version = None
         self._synthesizer_id = generate_synthesizer_id(self)
-        SYNTHESIZER_LOGGER.info(
-            '\nInstance:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            self._synthesizer_id
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Instance',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+        })
 
     def set_address_columns(self, column_names, anonymization_level='full'):
         """Set the address multi-column transformer."""
@@ -420,21 +417,15 @@ def fit_processed_data(self, processed_data):
             processed_data (pandas.DataFrame):
                 The transformed data used to fit the model to.
         """
-        SYNTHESIZER_LOGGER.info(
-            '\nFit processed data:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Statistics of the fit processed data:\n'
-            '    Total number of tables: 1\n'
-            '    Total number of rows: %s\n'
-            '    Total number of columns: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(processed_data),
-            len(processed_data.columns),
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Fit processed data',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': 1,
+            'TOTAL NUMBER OF ROWS': len(processed_data),
+            'TOTAL NUMBER OF COLUMNS': len(processed_data.columns)
+        })
 
         check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt)
         if not processed_data.empty:
@@ -452,21 +443,15 @@ def fit(self, data):
             data (pandas.DataFrame):
                 The raw data (before any transformations) to fit the model to.
         """
-        SYNTHESIZER_LOGGER.info(
-            '\nFit:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Statistics of the fit data:\n'
-            '    Total number of tables: 1\n'
-            '    Total number of rows: %s\n'
-            '    Total number of columns: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(data),
-            len(data.columns),
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Fit',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': 1,
+            'TOTAL NUMBER OF ROWS': len(data),
+            'TOTAL NUMBER OF COLUMNS': len(data.columns)
+        })
 
         check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt)
         self._check_metadata_updated()
Expand All @@ -484,15 +469,12 @@ def save(self, filepath):
Path where the synthesizer instance will be serialized.
"""
synthesizer_id = getattr(self, '_synthesizer_id', None)
SYNTHESIZER_LOGGER.info(
'\nSave:\n'
' Timestamp: %s\n'
' Synthesizer class name: %s\n'
' Synthesizer id: %s',
datetime.datetime.now(),
self.__class__.__name__,
synthesizer_id
)
SYNTHESIZER_LOGGER.info({
'EVENT': 'Save',
'TIMESTAMP': datetime.datetime.now(),
'SYNTHESIZER CLASS NAME': self.__class__.__name__,
'SYNTHESIZER ID': synthesizer_id,
})

with open(filepath, 'wb') as output:
cloudpickle.dump(self, output)
@@ -517,15 +499,12 @@ def load(cls, filepath):
         if getattr(synthesizer, '_synthesizer_id', None) is None:
             synthesizer._synthesizer_id = generate_synthesizer_id(synthesizer)
 
-        SYNTHESIZER_LOGGER.info(
-            '\nLoad:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Synthesizer id: %s',
-            datetime.datetime.now(),
-            synthesizer.__class__.__name__,
-            synthesizer._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Load',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': synthesizer.__class__.__name__,
+            'SYNTHESIZER ID': synthesizer._synthesizer_id,
+        })
 
         return synthesizer
 
@@ -913,21 +892,16 @@ def sample(self, num_rows, max_tries_per_batch=100, batch_size=None, output_file
         if not original_columns.empty:
             sampled_data.columns = self._original_columns
 
-        SYNTHESIZER_LOGGER.info(
-            '\nSample:\n'
-            '  Timestamp: %s\n'
-            '  Synthesizer class name: %s\n'
-            '  Statistics of the sample size:\n'
-            '    Total number of tables: 1\n'
-            '    Total number of rows: %s\n'
-            '    Total number of columns: %s\n'
-            '  Synthesizer id: %s',
-            sample_timestamp,
-            self.__class__.__name__,
-            len(sampled_data),
-            len(sampled_data.columns),
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Sample',
+            'TIMESTAMP': sample_timestamp,
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': 1,
+            'TOTAL NUMBER OF ROWS': len(sampled_data),
+            'TOTAL NUMBER OF COLUMNS': len(sampled_data.columns)
+
+        })
 
         return sampled_data
 
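
On the single-table path the same fields are logged with TOTAL NUMBER OF TABLES fixed at 1, so per-event statistics can be pulled straight out of the CSV. A minimal sketch, assuming events have already been written to the default logfile:

    from sdv.logging import load_logfile_dataframe

    logs = load_logfile_dataframe('sdv_logs.csv')
    samples = logs[logs['EVENT'] == 'Sample']
    print(samples[['TIMESTAMP', 'TOTAL NUMBER OF ROWS', 'TOTAL NUMBER OF COLUMNS']])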
