Skip to content

Commit

Permalink
Add ExcelHandler (#1962)
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer committed May 2, 2024
1 parent b94cf94 commit 60ae555
Show file tree
Hide file tree
Showing 5 changed files with 389 additions and 10 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ dependencies = [
'deepecho>=0.6.0',
'rdt>=1.12.0',
'sdmetrics>=0.14.0',
'platformdirs>=4.0'
'platformdirs>=4.0',
]

[project.urls]
Expand All @@ -51,7 +51,9 @@ dependencies = [
sdv = { main = 'sdv.cli.__main__:main' }

[project.optional-dependencies]
excel = ['pandas[excel]']
test = [
'sdv[excel]',
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
'pytest-rerunfailures>=10.3,<15',
Expand Down
5 changes: 3 additions & 2 deletions sdv/io/local/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Local I/O module."""

from sdv.io.local.local import BaseLocalHandler, CSVHandler
from sdv.io.local.local import BaseLocalHandler, CSVHandler, ExcelHandler

__all__ = (
'BaseLocalHandler',
'CSVHandler'
'CSVHandler',
'ExcelHandler'
)
98 changes: 96 additions & 2 deletions sdv/io/local/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _infer_metadata(self, data):
return metadata

def read(self):
"""Read data from files and returns it along with metadata.
"""Read data from files and return it along with metadata.
This method must be implemented by subclasses.
Expand Down Expand Up @@ -91,7 +91,7 @@ def __init__(self, sep=',', encoding='UTF', decimal='.', float_format=None,
self.quoting = quoting

def read(self, folder_name, file_names=None):
"""Read data from CSV files and returns it along with metadata.
"""Read data from CSV files and return it along with metadata.
Args:
folder_name (str):
Expand Down Expand Up @@ -192,3 +192,97 @@ def write(self, synthetic_data, folder_name, file_name_suffix=None, mode='x'):
quoting=self.quoting,
mode=mode,
)


class ExcelHandler(BaseLocalHandler):
    """A class for handling Excel files."""

    def _read_excel(self, file_path, sheet_names=None):
        """Read data from an Excel file and return it as a dictionary.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of the sheets to read. If ``None``, every sheet
                in the workbook is read.

        Returns:
            dict:
                A dictionary mapping each sheet name to a ``pandas.DataFrame``.
        """
        data = {}
        if sheet_names is None:
            # Discover every sheet present in the workbook.
            xl_file = pd.ExcelFile(file_path)
            sheet_names = xl_file.sheet_names

        for sheet_name in sheet_names:
            data[sheet_name] = pd.read_excel(
                file_path,
                sheet_name=sheet_name,
                parse_dates=False,
                decimal=self.decimal,
                index_col=None
            )

        return data

    def read(self, file_path, sheet_names=None):
        """Read data from Excel files and return it along with metadata.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of sheets to read. If None, all sheets are read.

        Returns:
            tuple:
                A tuple containing the data as a dictionary and metadata. The dictionary maps
                table names to pandas DataFrames. The metadata is an object describing the data.

        Raises:
            ValueError:
                If ``sheet_names`` is neither ``None`` nor a list.
        """
        # Validate before doing any file I/O.
        if sheet_names is not None and not isinstance(sheet_names, list):
            raise ValueError("'sheet_names' must be None or a list of strings.")

        data = self._read_excel(file_path, sheet_names)
        metadata = self._infer_metadata(data)
        return data, metadata

    def write(self, synthetic_data, file_name, sheet_name_suffix=None, mode='w'):
        """Write synthetic data to an Excel File.

        Args:
            synthetic_data (dict):
                A dictionary mapping table names to pandas DataFrames containing synthetic data.
            file_name (str):
                The name of the Excel file to write.
            sheet_name_suffix (str, optional):
                A suffix to add to each sheet name.
            mode (str, optional):
                The mode of writing to use. Defaults to 'w'.
                'w': Write sheets to a new file, clearing any existing file that may exist.
                'a': Append new sheets within the existing file.
                Note: You cannot append data to existing sheets.
        """
        temp_data = synthetic_data
        suffix_added = False

        if mode == 'a':
            # Append is implemented as read-modify-rewrite: load the existing
            # workbook, merge the new tables in, then write everything back.
            temp_data = self._read_excel(file_name)
            for table_name, table in synthetic_data.items():
                sheet_name = table_name
                if sheet_name_suffix:
                    sheet_name = f'{table_name}{sheet_name_suffix}'
                    suffix_added = True

                if temp_data.get(sheet_name) is not None:
                    # Fix: concatenate the loop's ``table`` rather than
                    # ``synthetic_data[sheet_name]`` — the latter raised a
                    # KeyError whenever a suffix was applied, because the
                    # synthetic_data keys are the unsuffixed table names.
                    temp_data[sheet_name] = pd.concat(
                        [temp_data[sheet_name], table],
                        ignore_index=True
                    )

                else:
                    temp_data[sheet_name] = table

        # Context manager guarantees the writer is closed even if a sheet
        # fails to serialize (the original leaked the handle on error).
        with pd.ExcelWriter(file_name) as writer:
            for table_name, table_data in temp_data.items():
                # In 'w' mode the suffix has not been applied yet; add it here.
                if sheet_name_suffix and not suffix_added:
                    table_name += sheet_name_suffix

                table_data.to_excel(
                    writer,
                    sheet_name=table_name,
                    float_format=self.float_format,
                    index=False
                )
73 changes: 70 additions & 3 deletions tests/integration/io/local/test_local.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pandas as pd

from sdv.io.local import CSVHandler
from sdv.io.local import CSVHandler, ExcelHandler
from sdv.metadata import MultiTableMetadata


class TestCSVHandler:

def test_integration_read_write(self, tmpdir):
"""Test end to end the read and write methods of ``CSVHandler``."""
def test_integration_write_and_read(self, tmpdir):
"""Test end to end the write and read methods of ``CSVHandler``."""
# Prepare synthetic data
synthetic_data = {
'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
Expand All @@ -30,3 +30,70 @@ def test_integration_read_write(self, tmpdir):
# Check if the dataframes match the original synthetic data
pd.testing.assert_frame_equal(data['table1'], synthetic_data['table1'])
pd.testing.assert_frame_equal(data['table2'], synthetic_data['table2'])


class TestExcelHandler:

    def test_integration_write_and_read(self, tmpdir):
        """Test end to end the write and read methods of ``ExcelHandler``."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }

        # Write synthetic data to an xlsx file (extension fixed from 'xslx':
        # pandas picks the writer engine from the suffix).
        handler = ExcelHandler()
        handler.write(synthetic_data, tmpdir / 'excel.xlsx')

        # Read data from the xlsx file
        data, metadata = handler.read(tmpdir / 'excel.xlsx')

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata) is True

        # Check if the dataframes match the original synthetic data
        pd.testing.assert_frame_equal(data['table1'], synthetic_data['table1'])
        pd.testing.assert_frame_equal(data['table2'], synthetic_data['table2'])

    def test_integration_write_and_read_append_mode(self, tmpdir):
        """Test end to end the write and read methods of ``ExcelHandler`` in append mode."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }

        # Write synthetic data to an xlsx file
        handler = ExcelHandler()
        handler.write(synthetic_data, tmpdir / 'excel.xlsx')

        # Read data from the xlsx file
        data, metadata = handler.read(tmpdir / 'excel.xlsx')

        # Write the same tables again using append mode
        handler.write(synthetic_data, tmpdir / 'excel.xlsx', mode='a')

        # Read data from the xlsx file
        data, metadata = handler.read(tmpdir / 'excel.xlsx')

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata) is True

        # Appending duplicates each table, so expect each frame twice.
        expected_table_one = pd.concat(
            [synthetic_data['table1'], synthetic_data['table1']],
            ignore_index=True
        )
        expected_table_two = pd.concat(
            [synthetic_data['table2'], synthetic_data['table2']],
            ignore_index=True
        )
        pd.testing.assert_frame_equal(data['table1'], expected_table_one)
        pd.testing.assert_frame_equal(data['table2'], expected_table_two)

0 comments on commit 60ae555

Please sign in to comment.