Skip to content

Commit

Permalink
Add ExcelHandler (#1962)
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer committed May 2, 2024
1 parent b94cf94 commit 60ae555
Show file tree
Hide file tree
Showing 5 changed files with 389 additions and 10 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ dependencies = [
'deepecho>=0.6.0',
'rdt>=1.12.0',
'sdmetrics>=0.14.0',
'platformdirs>=4.0'
'platformdirs>=4.0',
]

[project.urls]
Expand All @@ -51,7 +51,9 @@ dependencies = [
sdv = { main = 'sdv.cli.__main__:main' }

[project.optional-dependencies]
excel = ['pandas[excel]']
test = [
'sdv[excel]',
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
'pytest-rerunfailures>=10.3,<15',
Expand Down
5 changes: 3 additions & 2 deletions sdv/io/local/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Local I/O module."""

from sdv.io.local.local import BaseLocalHandler, CSVHandler
from sdv.io.local.local import BaseLocalHandler, CSVHandler, ExcelHandler

__all__ = (
'BaseLocalHandler',
'CSVHandler'
'CSVHandler',
'ExcelHandler'
)
98 changes: 96 additions & 2 deletions sdv/io/local/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _infer_metadata(self, data):
return metadata

def read(self):
"""Read data from files and returns it along with metadata.
"""Read data from files and return it along with metadata.
This method must be implemented by subclasses.
Expand Down Expand Up @@ -91,7 +91,7 @@ def __init__(self, sep=',', encoding='UTF', decimal='.', float_format=None,
self.quoting = quoting

def read(self, folder_name, file_names=None):
"""Read data from CSV files and returns it along with metadata.
"""Read data from CSV files and return it along with metadata.
Args:
folder_name (str):
Expand Down Expand Up @@ -192,3 +192,97 @@ def write(self, synthetic_data, folder_name, file_name_suffix=None, mode='x'):
quoting=self.quoting,
mode=mode,
)


class ExcelHandler(BaseLocalHandler):
    """A class for handling Excel files."""

    def _read_excel(self, file_path, sheet_names=None):
        """Read data from an Excel file and return it as a dictionary.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of the sheets to read. If ``None``, every sheet
                in the workbook is read.

        Returns:
            dict:
                A dictionary mapping each sheet name to a ``pandas.DataFrame``.
        """
        data = {}
        if sheet_names is None:
            # Discover every sheet present in the workbook.
            xl_file = pd.ExcelFile(file_path)
            sheet_names = xl_file.sheet_names

        for sheet_name in sheet_names:
            data[sheet_name] = pd.read_excel(
                file_path,
                sheet_name=sheet_name,
                parse_dates=False,
                decimal=self.decimal,
                index_col=None
            )

        return data

    def read(self, file_path, sheet_names=None):
        """Read data from Excel files and return it along with metadata.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of sheets to read. If None, all sheets are read.

        Returns:
            tuple:
                A tuple containing the data as a dictionary and metadata. The dictionary maps
                table names to pandas DataFrames. The metadata is an object describing the data.

        Raises:
            ValueError:
                If ``sheet_names`` is neither ``None`` nor a list.
        """
        # Validate before doing any file I/O.
        if sheet_names is not None and not isinstance(sheet_names, list):
            raise ValueError("'sheet_names' must be None or a list of strings.")

        data = self._read_excel(file_path, sheet_names)
        metadata = self._infer_metadata(data)
        return data, metadata

    def write(self, synthetic_data, file_name, sheet_name_suffix=None, mode='w'):
        """Write synthetic data to an Excel File.

        Args:
            synthetic_data (dict):
                A dictionary mapping table names to pandas DataFrames containing synthetic data.
            file_name (str):
                The name of the Excel file to write.
            sheet_name_suffix (str, optional):
                A suffix to add to each sheet name.
            mode (str, optional):
                The mode of writing to use. Defaults to 'w'.
                'w': Write sheets to a new file, clearing any existing file that may exist.
                'a': Append new sheets within the existing file.
                Note: You cannot append data to existing sheets.
        """
        temp_data = synthetic_data
        suffix_added = False

        if mode == 'a':
            # Append is implemented as read-modify-rewrite: load the existing
            # workbook, merge the new tables in, then write everything back.
            temp_data = self._read_excel(file_name)
            for table_name, table in synthetic_data.items():
                sheet_name = table_name
                if sheet_name_suffix:
                    sheet_name = f'{table_name}{sheet_name_suffix}'
                    suffix_added = True

                if temp_data.get(sheet_name) is not None:
                    # Fix: concatenate the loop's ``table`` rather than
                    # ``synthetic_data[sheet_name]`` — the latter raised a
                    # KeyError whenever a suffix was applied, because the
                    # synthetic_data keys are the unsuffixed table names.
                    temp_data[sheet_name] = pd.concat(
                        [temp_data[sheet_name], table],
                        ignore_index=True
                    )

                else:
                    temp_data[sheet_name] = table

        # Context manager guarantees the writer is closed even if a sheet
        # fails to serialize (the original leaked the handle on error).
        with pd.ExcelWriter(file_name) as writer:
            for table_name, table_data in temp_data.items():
                # In 'w' mode the suffix has not been applied yet; add it here.
                if sheet_name_suffix and not suffix_added:
                    table_name += sheet_name_suffix

                table_data.to_excel(
                    writer,
                    sheet_name=table_name,
                    float_format=self.float_format,
                    index=False
                )
73 changes: 70 additions & 3 deletions tests/integration/io/local/test_local.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pandas as pd

from sdv.io.local import CSVHandler
from sdv.io.local import CSVHandler, ExcelHandler
from sdv.metadata import MultiTableMetadata


class TestCSVHandler:

def test_integration_read_write(self, tmpdir):
"""Test end to end the read and write methods of ``CSVHandler``."""
def test_integration_write_and_read(self, tmpdir):
"""Test end to end the write and read methods of ``CSVHandler``."""
# Prepare synthetic data
synthetic_data = {
'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
Expand All @@ -30,3 +30,70 @@ def test_integration_read_write(self, tmpdir):
# Check if the dataframes match the original synthetic data
pd.testing.assert_frame_equal(data['table1'], synthetic_data['table1'])
pd.testing.assert_frame_equal(data['table2'], synthetic_data['table2'])


class TestExcelHandler:

    def test_integration_write_and_read(self, tmpdir):
        """Test end to end the write and read methods of ``ExcelHandler``."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }

        # Write synthetic data to an xlsx file (extension fixed from 'xslx':
        # pandas picks the writer engine from the suffix).
        handler = ExcelHandler()
        handler.write(synthetic_data, tmpdir / 'excel.xlsx')

        # Read data from the xlsx file
        data, metadata = handler.read(tmpdir / 'excel.xlsx')

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata) is True

        # Check if the dataframes match the original synthetic data
        pd.testing.assert_frame_equal(data['table1'], synthetic_data['table1'])
        pd.testing.assert_frame_equal(data['table2'], synthetic_data['table2'])

    def test_integration_write_and_read_append_mode(self, tmpdir):
        """Test end to end the write and read methods of ``ExcelHandler`` in append mode."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }

        # Write synthetic data to an xlsx file
        handler = ExcelHandler()
        handler.write(synthetic_data, tmpdir / 'excel.xlsx')

        # Read data from the xlsx file
        data, metadata = handler.read(tmpdir / 'excel.xlsx')

        # Write the same tables again using append mode
        handler.write(synthetic_data, tmpdir / 'excel.xlsx', mode='a')

        # Read data from the xlsx file
        data, metadata = handler.read(tmpdir / 'excel.xlsx')

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata) is True

        # Appending duplicates each table, so expect each frame twice.
        expected_table_one = pd.concat(
            [synthetic_data['table1'], synthetic_data['table1']],
            ignore_index=True
        )
        expected_table_two = pd.concat(
            [synthetic_data['table2'], synthetic_data['table2']],
            ignore_index=True
        )
        pd.testing.assert_frame_equal(data['table1'], expected_table_one)
        pd.testing.assert_frame_equal(data['table2'], expected_table_two)

0 comments on commit 60ae555

Please sign in to comment.