Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ExcelHandler #1962

Merged
merged 9 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies = [
'deepecho>=0.6.0',
'rdt>=1.12.0',
'sdmetrics>=0.14.0',
'platformdirs>=4.0'
'platformdirs>=4.0',
]

[project.urls]
Expand All @@ -52,7 +52,9 @@ dependencies = [
sdv = { main = 'sdv.cli.__main__:main' }

[project.optional-dependencies]
excel = ['pandas[excel]']
test = [
'sdv[excel]',
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
'pytest-rerunfailures>=10.3,<15',
Expand Down
5 changes: 3 additions & 2 deletions sdv/io/local/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Local I/O module."""

from sdv.io.local.local import BaseLocalHandler, CSVHandler
from sdv.io.local.local import BaseLocalHandler, CSVHandler, ExcelHandler

# Public names re-exported by the local I/O package. Every entry ends with a
# comma so that adjacent string literals can never be concatenated implicitly.
__all__ = (
    'BaseLocalHandler',
    'CSVHandler',
    'ExcelHandler',
)
98 changes: 96 additions & 2 deletions sdv/io/local/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _infer_metadata(self, data):
return metadata

def read(self):
"""Read data from files and returns it along with metadata.
"""Read data from files and return it along with metadata.

This method must be implemented by subclasses.

Expand Down Expand Up @@ -91,7 +91,7 @@ def __init__(self, sep=',', encoding='UTF', decimal='.', float_format=None,
self.quoting = quoting

def read(self, folder_name, file_names=None):
"""Read data from CSV files and returns it along with metadata.
"""Read data from CSV files and return it along with metadata.

Args:
folder_name (str):
Expand Down Expand Up @@ -192,3 +192,97 @@ def write(self, synthetic_data, folder_name, file_name_suffix=None, mode='x'):
quoting=self.quoting,
mode=mode,
)


class ExcelHandler(BaseLocalHandler):
    """A class for handling Excel files.

    Every sheet of a workbook is treated as one table.

    NOTE(review): ``self.decimal`` and ``self.float_format`` are read by the methods
    below but are not assigned in this class; presumably ``BaseLocalHandler.__init__``
    sets them -- confirm against the base class.
    """

    def _read_excel(self, file_path, sheet_names=None):
        """Read data from an Excel file and return just the data as a dictionary.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of the sheets to read. If None, all sheets are read.

        Returns:
            dict:
                A dictionary mapping each sheet name to a pandas DataFrame.
        """
        data = {}
        # Open the workbook a single time and close it deterministically, so the file
        # handle is released before a later write to the same path (append mode).
        with pd.ExcelFile(file_path) as xl_file:
            if sheet_names is None:
                sheet_names = xl_file.sheet_names

            for sheet_name in sheet_names:
                data[sheet_name] = pd.read_excel(
                    xl_file,
                    sheet_name=sheet_name,
                    parse_dates=False,
                    decimal=self.decimal,
                    index_col=None
                )

        return data

    def read(self, file_path, sheet_names=None):
        """Read data from Excel files and return it along with metadata.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of sheets to read. If None, all sheets are read.

        Returns:
            tuple:
                A tuple containing the data as a dictionary and metadata. The dictionary maps
                table names to pandas DataFrames. The metadata is the object returned by
                ``_infer_metadata`` for that data.

        Raises:
            ValueError:
                If ``sheet_names`` is neither None nor a list.
        """
        if sheet_names is not None and not isinstance(sheet_names, list):
            raise ValueError("'sheet_names' must be None or a list of strings.")

        data = self._read_excel(file_path, sheet_names)
        metadata = self._infer_metadata(data)
        return data, metadata

    def write(self, synthetic_data, file_name, sheet_name_suffix=None, mode='w'):
        """Write synthetic data to an Excel file.

        Args:
            synthetic_data (dict):
                A dictionary mapping table names to pandas DataFrames containing synthetic data.
            file_name (str):
                The name of the Excel file to write.
            sheet_name_suffix (str, optional):
                A suffix to add to the sheet name of each table in ``synthetic_data``.
                Sheets that already exist in the file are never renamed.
            mode (str, optional):
                The mode of writing to use. Defaults to 'w'.
                'w': Write sheets to a new file, clearing any existing file that may exist.
                'a': Read the existing file first and add the synthetic data to it. If a
                    sheet with the (suffixed) name already exists, the new rows are appended
                    below its existing rows; otherwise a new sheet is created.
                Any other value is handled like 'w'.
        """
        suffix = sheet_name_suffix or ''

        # In append mode start from the sheets already stored in the file; the whole
        # workbook is rewritten below, so they must be carried over explicitly.
        sheets = self._read_excel(file_name) if mode == 'a' else {}

        for table_name, table in synthetic_data.items():
            sheet_name = f'{table_name}{suffix}'
            if sheet_name in sheets:
                # Use ``table`` directly: ``synthetic_data`` is keyed by the unsuffixed
                # table name, so looking it up by ``sheet_name`` fails when a suffix is set.
                sheets[sheet_name] = pd.concat(
                    [sheets[sheet_name], table],
                    ignore_index=True
                )
            else:
                sheets[sheet_name] = table

        # The context manager saves and closes the workbook even if a sheet fails to write.
        with pd.ExcelWriter(file_name) as writer:
            for sheet_name, table in sheets.items():
                table.to_excel(
                    writer,
                    sheet_name=sheet_name,
                    float_format=self.float_format,
                    index=False
                )
73 changes: 70 additions & 3 deletions tests/integration/io/local/test_local.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pandas as pd

from sdv.io.local import CSVHandler
from sdv.io.local import CSVHandler, ExcelHandler
from sdv.metadata import MultiTableMetadata


class TestCSVHandler:

def test_integration_read_write(self, tmpdir):
"""Test end to end the read and write methods of ``CSVHandler``."""
def test_integration_write_and_read(self, tmpdir):
"""Test end to end the write and read methods of ``CSVHandler``."""
# Prepare synthetic data
synthetic_data = {
'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
Expand All @@ -30,3 +30,70 @@ def test_integration_read_write(self, tmpdir):
# Check if the dataframes match the original synthetic data
pd.testing.assert_frame_equal(data['table1'], synthetic_data['table1'])
pd.testing.assert_frame_equal(data['table2'], synthetic_data['table2'])


class TestExcelHandler:
    """Integration tests for ``ExcelHandler``."""

    def test_integration_write_and_read(self, tmpdir):
        """Test end to end the write and read methods of ``ExcelHandler``."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }
        file_path = tmpdir / 'excel.xlsx'

        # Write synthetic data to the xlsx file
        handler = ExcelHandler()
        handler.write(synthetic_data, file_path)

        # Read data back from the xlsx file
        data, metadata = handler.read(file_path)

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata)

        # Check if the dataframes match the original synthetic data
        pd.testing.assert_frame_equal(data['table1'], synthetic_data['table1'])
        pd.testing.assert_frame_equal(data['table2'], synthetic_data['table2'])

    def test_integration_write_and_read_append_mode(self, tmpdir):
        """Test that writing with ``mode='a'`` appends rows to the existing sheets."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }
        file_path = tmpdir / 'excel.xlsx'

        # Write synthetic data to the xlsx file
        handler = ExcelHandler()
        handler.write(synthetic_data, file_path)

        # Write the same data again using append mode
        handler.write(synthetic_data, file_path, mode='a')

        # Read data back from the xlsx file
        data, metadata = handler.read(file_path)

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata)

        # Each sheet must now hold the original rows twice, with a fresh index
        expected_table_one = pd.concat(
            [synthetic_data['table1'], synthetic_data['table1']],
            ignore_index=True
        )
        expected_table_two = pd.concat(
            [synthetic_data['table2'], synthetic_data['table2']],
            ignore_index=True
        )
        pd.testing.assert_frame_equal(data['table1'], expected_table_one)
        pd.testing.assert_frame_equal(data['table2'], expected_table_two)