Update logs to write to CSV (#2005)

frances-h committed May 15, 2024
1 parent 2d2b030 commit e2c3514

Showing 10 changed files with 340 additions and 251 deletions.
4 changes: 3 additions & 1 deletion sdv/logging/__init__.py
@@ -1,10 +1,12 @@
"""Module for configuring loggers within the SDV library."""

from sdv.logging.logger import get_sdv_logger
from sdv.logging.utils import disable_single_table_logger, get_sdv_logger_config
from sdv.logging.utils import (
disable_single_table_logger, get_sdv_logger_config, load_logfile_dataframe)

__all__ = (
'disable_single_table_logger',
'get_sdv_logger',
'get_sdv_logger_config',
'load_logfile_dataframe'
)
31 changes: 29 additions & 2 deletions sdv/logging/logger.py
@@ -1,11 +1,35 @@
"""SDV Logger."""

import csv
import logging
from functools import lru_cache
from io import StringIO

from sdv.logging.utils import get_sdv_logger_config


class CSVFormatter(logging.Formatter):
"""Logging formatter to convert to CSV."""

def __init__(self):
super().__init__()
self.output = StringIO()
headers = [
'LEVEL', 'EVENT', 'TIMESTAMP', 'SYNTHESIZER CLASS NAME', 'SYNTHESIZER ID',
'TOTAL NUMBER OF TABLES', 'TOTAL NUMBER OF ROWS', 'TOTAL NUMBER OF COLUMNS'
]
self.writer = csv.DictWriter(self.output, headers)

def format(self, record): # noqa: A003
"""Format the record and write to CSV."""
row = record.msg
row['LEVEL'] = record.levelname
self.writer.writerow(row)
data = self.output.getvalue()
self.output.truncate(0)
self.output.seek(0)
return data.strip()


@lru_cache()
def get_sdv_logger(logger_name):
"""Get a logger instance with the specified name and configuration.
@@ -38,7 +62,10 @@ def get_sdv_logger(logger_name):
     formatter = None
     config = logger_conf.get('loggers').get(logger_name)
     log_level = getattr(logging, config.get('level', 'INFO'))
-    if config.get('format'):
+    if config.get('formatter'):
+        if config.get('formatter') == 'sdv.logging.logger.CSVFormatter':
+            formatter = CSVFormatter()
+    elif config.get('format'):
         formatter = logging.Formatter(config.get('format'))
 
     logger.setLevel(log_level)
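For reference, a minimal sketch (not part of the commit) of how the new CSVFormatter behaves: wire it to a StreamHandler and log a dict-shaped event, the way SYNTHESIZER_LOGGER now does, and one CSV row comes out. The logger name and field values below are illustrative.

import logging

from sdv.logging.logger import CSVFormatter

demo_logger = logging.getLogger('csv_formatter_demo')  # name is illustrative
handler = logging.StreamHandler()
handler.setFormatter(CSVFormatter())
demo_logger.addHandler(handler)
demo_logger.setLevel(logging.INFO)

# The record's msg is a dict; format() adds LEVEL and serializes it via csv.DictWriter.
# Keys missing from the dict (e.g. the TOTAL NUMBER OF ... fields) come out as empty cells.
demo_logger.info({
    'EVENT': 'Instance',
    'TIMESTAMP': '2024-05-15 12:00:00',             # illustrative value
    'SYNTHESIZER CLASS NAME': 'HMASynthesizer',
    'SYNTHESIZER ID': 'HMASynthesizer_1.12.0_abc',  # illustrative id
})
# Output (one row, no header line):
# INFO,Instance,2024-05-15 12:00:00,HMASynthesizer,HMASynthesizer_1.12.0_abc,,,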
12 changes: 8 additions & 4 deletions sdv/logging/sdv_logger_config.yml
@@ -4,24 +4,28 @@ loggers:
   SingleTableSynthesizer:
     level: INFO
     propagate: false
+    formatter: sdv.logging.logger.CSVFormatter
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
   MultiTableSynthesizer:
     level: INFO
     propagate: false
+    formatter: sdv.logging.logger.CSVFormatter
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
   MultiTableMetadata:
     level: INFO
     propagate: false
+    formatter: sdv.logging.logger.CSVFormatter
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
   SingleTableMetadata:
     level: INFO
     propagate: false
+    formatter: sdv.logging.logger.CSVFormatter
     handlers:
       class: logging.FileHandler
-      filename: sdv_logs.log
+      filename: sdv_logs.csv
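As a sanity check on the config above, the handler wiring that get_sdv_logger builds from one of these YAML entries is roughly equivalent to the following sketch (logger name, level, and filename are taken from the entry; this is illustrative, not the function's literal body, and in practice the filename is first prefixed with a platformdirs store path by get_sdv_logger_config).

import logging

from sdv.logging.logger import CSVFormatter

logger = logging.getLogger('SingleTableSynthesizer')
logger.setLevel(logging.INFO)                      # level: INFO
logger.propagate = False                           # propagate: false
csv_handler = logging.FileHandler('sdv_logs.csv')  # class / filename from handlers
csv_handler.setFormatter(CSVFormatter())           # formatter: sdv.logging.logger.CSVFormatter
logger.addHandler(csv_handler)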
17 changes: 16 additions & 1 deletion sdv/logging/utils.py
Expand Up @@ -5,6 +5,7 @@
 import shutil
 from pathlib import Path
 
+import pandas as pd
 import platformdirs
 import yaml
 
@@ -25,7 +26,7 @@ def get_sdv_logger_config():

     for logger in logger_conf.get('loggers', {}).values():
         handler = logger.get('handlers', {})
-        if handler.get('filename') == 'sdv_logs.log':
+        if handler.get('filename') == 'sdv_logs.csv':
             handler['filename'] = store_path / handler['filename']
 
     return logger_conf
@@ -49,3 +50,17 @@ def disable_single_table_logger():
     finally:
         for handler in handlers:
             single_table_logger.addHandler(handler)
+
+
+def load_logfile_dataframe(logfile):
+    """Load the SDV logfile as a pandas DataFrame with correct column headers.
+
+    Args:
+        logfile (str):
+            Path to the SDV log CSV file.
+    """
+    column_names = [
+        'LEVEL', 'EVENT', 'TIMESTAMP', 'SYNTHESIZER CLASS NAME', 'SYNTHESIZER ID',
+        'TOTAL NUMBER OF TABLES', 'TOTAL NUMBER OF ROWS', 'TOTAL NUMBER OF COLUMNS'
+    ]
+    return pd.read_csv(logfile, names=column_names)
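Because the CSV rows are written without a header line, load_logfile_dataframe supplies the column names itself. A quick usage sketch (the path is illustrative; in practice it comes from the logger config):

from sdv.logging import load_logfile_dataframe

logs = load_logfile_dataframe('sdv_logs.csv')  # illustrative path
print(logs[['LEVEL', 'EVENT', 'SYNTHESIZER CLASS NAME']].head())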
125 changes: 50 additions & 75 deletions sdv/multi_table/base.py
@@ -119,15 +119,12 @@ def __init__(self, metadata, locales=['en_US'], synthesizer_kwargs=None):
         self._fitted_sdv_version = None
         self._fitted_sdv_enterprise_version = None
         self._synthesizer_id = generate_synthesizer_id(self)
-        SYNTHESIZER_LOGGER.info(
-            '\nInstance:\n'
-            ' Timestamp: %s\n'
-            ' Synthesizer class name: %s\n'
-            ' Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            self._synthesizer_id
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Instance',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id
+        })
 
     def set_address_columns(self, table_name, column_names, anonymization_level='full'):
         """Set the address multi-column transformer.
@@ -403,22 +400,16 @@ def fit_processed_data(self, processed_data):
             total_rows += len(table)
             total_columns += len(table.columns)
 
-        SYNTHESIZER_LOGGER.info(
-            '\nFit processed data:\n'
-            ' Timestamp: %s\n'
-            ' Synthesizer class name: %s\n'
-            ' Statistics of the fit processed data:\n'
-            ' Total number of tables: %s\n'
-            ' Total number of rows: %s\n'
-            ' Total number of columns: %s\n'
-            ' Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(processed_data),
-            total_rows,
-            total_columns,
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Fit processed data',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': len(processed_data),
+            'TOTAL NUMBER OF ROWS': total_rows,
+            'TOTAL NUMBER OF COLUMNS': total_columns
+        })
 
         check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt)
         with disable_single_table_logger():
             augmented_data = self._augment_tables(processed_data)
@@ -443,22 +434,16 @@ def fit(self, data):
             total_rows += len(table)
             total_columns += len(table.columns)
 
-        SYNTHESIZER_LOGGER.info(
-            '\nFit:\n'
-            ' Timestamp: %s\n'
-            ' Synthesizer class name: %s\n'
-            ' Statistics of the fit data:\n'
-            ' Total number of tables: %s\n'
-            ' Total number of rows: %s\n'
-            ' Total number of columns: %s\n'
-            ' Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(data),
-            total_rows,
-            total_columns,
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Fit',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': len(data),
+            'TOTAL NUMBER OF ROWS': total_rows,
+            'TOTAL NUMBER OF COLUMNS': total_columns
+        })
 
         check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt)
         _validate_foreign_keys_not_null(self.metadata, data)
         self._check_metadata_updated()
@@ -511,22 +496,16 @@ def sample(self, scale=1.0):
             if table in table_columns:
                 sampled_data[table].columns = table_columns[table]
 
-        SYNTHESIZER_LOGGER.info(
-            '\nSample:\n'
-            ' Timestamp: %s\n'
-            ' Synthesizer class name: %s\n'
-            ' Statistics of the sample size:\n'
-            ' Total number of tables: %s\n'
-            ' Total number of rows: %s\n'
-            ' Total number of columns: %s\n'
-            ' Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            len(sampled_data),
-            total_rows,
-            total_columns,
-            self._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Sample',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': self._synthesizer_id,
+            'TOTAL NUMBER OF TABLES': len(sampled_data),
+            'TOTAL NUMBER OF ROWS': total_rows,
+            'TOTAL NUMBER OF COLUMNS': total_columns
+        })
 
         return sampled_data
 
     def get_learned_distributions(self, table_name):
@@ -692,15 +671,13 @@ def save(self, filepath):
                 Path where the instance will be serialized.
         """
         synthesizer_id = getattr(self, '_synthesizer_id', None)
-        SYNTHESIZER_LOGGER.info(
-            '\nSave:\n'
-            ' Timestamp: %s\n'
-            ' Synthesizer class name: %s\n'
-            ' Synthesizer id: %s',
-            datetime.datetime.now(),
-            self.__class__.__name__,
-            synthesizer_id
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Save',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': self.__class__.__name__,
+            'SYNTHESIZER ID': synthesizer_id,
+        })
 
         with open(filepath, 'wb') as output:
             cloudpickle.dump(self, output)
 
@@ -724,13 +701,11 @@ def load(cls, filepath):
         if getattr(synthesizer, '_synthesizer_id', None) is None:
             synthesizer._synthesizer_id = generate_synthesizer_id(synthesizer)
 
-        SYNTHESIZER_LOGGER.info(
-            '\nLoad:\n'
-            ' Timestamp: %s\n'
-            ' Synthesizer class name: %s\n'
-            ' Synthesizer id: %s',
-            datetime.datetime.now(),
-            synthesizer.__class__.__name__,
-            synthesizer._synthesizer_id,
-        )
+        SYNTHESIZER_LOGGER.info({
+            'EVENT': 'Load',
+            'TIMESTAMP': datetime.datetime.now(),
+            'SYNTHESIZER CLASS NAME': synthesizer.__class__.__name__,
+            'SYNTHESIZER ID': synthesizer._synthesizer_id,
+        })
 
         return synthesizer
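Putting the pieces together, a hedged end-to-end sketch of the new logging flow (the demo dataset name and exact event sequence are assumptions for illustration; the log location is resolved from the config via the store path, as in get_sdv_logger_config above):

from sdv.datasets.demo import download_demo
from sdv.logging import get_sdv_logger_config, load_logfile_dataframe
from sdv.multi_table import HMASynthesizer

real_data, metadata = download_demo(modality='multi_table', dataset_name='fake_hotels')

synthesizer = HMASynthesizer(metadata)          # logs an 'Instance' event
synthesizer.fit(real_data)                      # logs 'Fit' and 'Fit processed data' events
synthetic_data = synthesizer.sample(scale=1.0)  # logs a 'Sample' event

# The handler's filename was prefixed with the platformdirs store path.
config = get_sdv_logger_config()
logfile = config['loggers']['MultiTableSynthesizer']['handlers']['filename']
events = load_logfile_dataframe(logfile)
print(events['EVENT'].tolist())  # e.g. ['Instance', 'Fit', 'Fit processed data', 'Sample']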
