Skip to content

Commit

Permalink
Merge pull request #110 from tompollard/row_percent
Browse files Browse the repository at this point in the history
add option to calculate "n(%)" percentages over a row. Ref #108
  • Loading branch information
tompollard committed Jan 4, 2021
2 parents 5b3d644 + d89fde9 commit 938fdf9
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 26 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
Expand Up @@ -58,9 +58,9 @@
# built documents.
#
# The short X.Y version.
version = u'0.7.9'
version = u'0.7.10'
# The full version, including alpha/beta/rc tags.
release = u'0.7.9'
release = u'0.7.10'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -17,7 +17,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='0.7.9',
version='0.7.10',

description='TableOne',
long_description=long_description,
Expand Down
2 changes: 1 addition & 1 deletion tableone/__init__.py
Expand Up @@ -2,4 +2,4 @@
from .tableone import TableOne, load_dataset, tableone

__author__ = "Tom Pollard <tpollard@mit.edu>, Alistair Johnson, Jesse Raffa"
__version__ = "0.7.9"
__version__ = "0.7.10"
67 changes: 48 additions & 19 deletions tableone/tableone.py
Expand Up @@ -145,6 +145,9 @@ class TableOne(object):
overall : bool, optional
If True, add an "overall" column to the table. Smd and p-value
calculations are performed only using stratified columns.
row_percent : bool, optional
If True, compute "n (%)" percentages for categorical variables across
"groupby" rows rather than columns.
display_all : bool, optional
If True, set pd.display_options to display all columns and rows.
(default: False)
Expand Down Expand Up @@ -206,9 +209,8 @@ def __init__(self, data: pd.DataFrame, columns: Optional[list] = None,
order: Optional[dict] = None, remarks: bool = False,
label_suffix: bool = True, decimals: Union[int, dict] = 1,
smd: bool = False, overall: bool = True,
display_all: bool = False,
dip_test: bool = False,
normal_test: bool = False,
row_percent: bool = False, display_all: bool = False,
dip_test: bool = False, normal_test: bool = False,
tukey_test: bool = False) -> None:

# labels is now rename
Expand Down Expand Up @@ -348,6 +350,7 @@ def __init__(self, data: pd.DataFrame, columns: Optional[list] = None,
self._decimals = decimals
self._smd = smd
self._overall = overall
self._row_percent = row_percent

# display notes and warnings below the table
self._warnings = {}
Expand Down Expand Up @@ -928,14 +931,14 @@ def _create_cont_describe(self, data, groupby):

return df_cont

def _format_cat(self, row):
def _format_cat(self, row, col):
var = row.name[0]
if var in self._decimals:
n = self._decimals[var]
else:
n = 1
f = '{{:.{}f}}'.format(n)
return f.format(row.percent)
return f.format(row[col])

def _create_cat_describe(self, data, groupby, groupbylvls):
"""
Expand All @@ -945,6 +948,10 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
----------
data : pandas DataFrame
The input dataset.
groupby : Str
Variable to group by.
groupbylvls : List
List of levels in the groupby variable.
Returns
----------
Expand All @@ -953,46 +960,64 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
"""
group_dict = {}

cat_slice = data[self._categorical].copy()

for g in groupbylvls:
if groupby:
d_slice = data.loc[data[groupby] == g, self._categorical]
df = cat_slice.loc[data[groupby] == g, self._categorical]
else:
d_slice = data[self._categorical].copy()
df = cat_slice.copy()

# create a dataframe with freq, proportion
df = d_slice.copy()
# create n column and null count column
# must be done before converting values to strings
ct = df.count().to_frame(name='n')
ct.index.name = 'variable'
nulls = df.isnull().sum().to_frame(name='Missing')
nulls.index.name = 'variable'

# convert to str to handle int converted to boolean. Avoid nans.
# Convert to str to handle int converted to boolean in the index.
# Also avoid nans.
for column in df.columns:
df[column] = [str(row) if not pd.isnull(row)
else None for row in df[column].values]
cat_slice[column] = [str(row) if not pd.isnull(row)
else None for row
in cat_slice[column].values]

# create a dataframe with freq, proportion
df = df.melt().groupby(['variable',
'value']).size().to_frame(name='freq')

df['percent'] = df['freq'].div(df.freq.sum(level=0),
level=0).astype(float) * 100

# add row percent
df['percent_row'] = df['freq'].div(cat_slice[self._categorical]
.melt()
.groupby(['variable', 'value'])
.size()) * 100

# set number of decimal places for percent
if isinstance(self._decimals, int):
n = self._decimals
f = '{{:.{}f}}'.format(n)
df['percent_str'] = df['percent'].astype(float).map(f.format)
df['percent_row_str'] = df['percent_row'].astype(float).map(f.format)
elif isinstance(self._decimals, dict):
df.loc[:, 'percent_str'] = df.apply(self._format_cat, axis=1)
df.loc[:, 'percent_str'] = df.apply(self._format_cat, axis=1,
args=['percent'])
df.loc[:, 'percent_row_str'] = df.apply(self._format_cat,
axis=1,
args=['percent_row'])
else:
n = 1
f = '{{:.{}f}}'.format(n)
df['percent_str'] = df['percent'].astype(float).map(f.format)
df['percent_row_str'] = df['percent_row'].astype(float).map(f.format)

# add n column, listing total non-null values for each variable
ct = d_slice.count().to_frame(name='n')
ct.index.name = 'variable'
# join count column
df = df.join(ct)

# add null count
nulls = d_slice.isnull().sum().to_frame(name='Missing')
nulls.index.name = 'variable'
# only save null count to the first category for each variable
# do this by extracting the first category from the df row index
levels = df.reset_index()[['variable',
Expand All @@ -1004,8 +1029,12 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
df = df.join(nulls)

# add summary column
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_str.map(str)+')')
if self._row_percent:
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_row_str.map(str)+')')
else:
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_str.map(str)+')')

# add to dictionary
group_dict[g] = df
Expand Down
158 changes: 155 additions & 3 deletions test_tableone.py
Expand Up @@ -2,7 +2,8 @@
import warnings

from nose.tools import (with_setup, assert_raises, assert_equal,
assert_almost_equal, assert_list_equal)
assert_almost_equal, assert_list_equal,
assert_count_equal)
import numpy as np
import pandas as pd
from scipy import stats
Expand Down Expand Up @@ -1097,8 +1098,6 @@ def test_min_max_for_nonnormal_variables(self):
# optionally, a categorical variable for stratification
groupby = ['death']

self.data_pn

t1 = TableOne(self.data_pn, columns=columns, categorical=categorical,
groupby=groupby, nonnormal=nonnormal, decimals=decimals,
min_max=['Age'])
Expand All @@ -1110,3 +1109,156 @@ def test_min_max_for_nonnormal_variables(self):
for c, e in zip(t1_columns, expected):
cell = t1.tableone.loc[k][group][c].values[0]
assert_equal(cell, e)

@with_setup(setup, teardown)
def test_row_percent_false(self):
    """
    Test row_percent=False displays n(%) for the column.
    """
    # columns to summarize
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']

    # columns containing categorical variables
    categorical = ['ICU', 'MechVent']

    # set decimal places for age to 0
    decimals = {"Age": 0}

    # non-normal variables
    nonnormal = ['Age']

    # optionally, a categorical variable for stratification
    groupby = ['death']
    group = "Grouped by death"

    # row_percent = False
    t1 = TableOne(self.data_pn, columns=columns,
                  categorical=categorical, groupby=groupby,
                  nonnormal=nonnormal, decimals=decimals,
                  row_percent=False)

    # expected "n (%)" cells keyed by table row label; one inner list per
    # category level, in display order
    expected = {
        "MechVent, n (%)": [
            [0, '540 (54.0)', '468 (54.2)', '72 (52.9)'],
            ['', '460 (46.0)', '396 (45.8)', '64 (47.1)'],
        ],
        "ICU, n (%)": [
            [0, '162 (16.2)', '137 (15.9)', '25 (18.4)'],
            ['', '202 (20.2)', '194 (22.5)', '8 (5.9)'],
            ['', '380 (38.0)', '318 (36.8)', '62 (45.6)'],
            ['', '256 (25.6)', '215 (24.9)', '41 (30.1)'],
        ],
    }
    for label, rows in expected.items():
        for i, expect in enumerate(rows):
            observed = list(t1.tableone.loc[label][group].values[i])
            assert_list_equal(observed, expect)

@with_setup(setup, teardown)
def test_row_percent_true(self):
    """
    Test row_percent=True displays n(%) for the row rather than the column.
    """
    # columns to summarize
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']

    # columns containing categorical variables
    categorical = ['ICU', 'MechVent']

    # set decimal places for age to 0
    decimals = {"Age": 0}

    # non-normal variables
    nonnormal = ['Age']

    # optionally, a categorical variable for stratification
    groupby = ['death']
    group = "Grouped by death"

    # row_percent = True
    t2 = TableOne(self.data_pn, columns=columns,
                  categorical=categorical, groupby=groupby,
                  nonnormal=nonnormal, decimals=decimals,
                  row_percent=True)

    # with row percentages the "Overall" column is always 100.0 and the
    # stratified columns sum to 100 across each row
    mechvent_expected = [
        [0, '540 (100.0)', '468 (86.7)', '72 (13.3)'],
        ['', '460 (100.0)', '396 (86.1)', '64 (13.9)'],
    ]
    icu_expected = [
        [0, '162 (100.0)', '137 (84.6)', '25 (15.4)'],
        ['', '202 (100.0)', '194 (96.0)', '8 (4.0)'],
        ['', '380 (100.0)', '318 (83.7)', '62 (16.3)'],
        ['', '256 (100.0)', '215 (84.0)', '41 (16.0)'],
    ]

    for i, expect in enumerate(mechvent_expected):
        observed = list(t2.tableone.loc["MechVent, n (%)"][group].values[i])
        assert_list_equal(observed, expect)

    for i, expect in enumerate(icu_expected):
        observed = list(t2.tableone.loc["ICU, n (%)"][group].values[i])
        assert_list_equal(observed, expect)

@with_setup(setup, teardown)
def test_row_percent_true_and_overall_false(self):
    """
    Test that row_percent=True combined with overall=False displays n(%)
    across the row and omits the "Overall" column from the table.
    """
    # columns to summarize
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']

    # columns containing categorical variables
    categorical = ['ICU', 'MechVent']

    # set decimal places for age to 0
    decimals = {"Age": 0}

    # non-normal variables
    nonnormal = ['Age']

    # optionally, a categorical variable for stratification
    groupby = ['death']
    group = "Grouped by death"

    # row_percent = True
    t1 = TableOne(self.data_pn, columns=columns, overall=False,
                  categorical=categorical, groupby=groupby,
                  nonnormal=nonnormal, decimals=decimals,
                  row_percent=True)

    # expected rows have one fewer cell than the overall=True case:
    # [Missing, death=0 "n (%)", death=1 "n (%)"]
    row1 = list(t1.tableone.loc["MechVent, n (%)"][group].values[0])
    row1_expect = [0, '468 (86.7)', '72 (13.3)']
    assert_list_equal(row1, row1_expect)

    row2 = list(t1.tableone.loc["MechVent, n (%)"][group].values[1])
    row2_expect = ['', '396 (86.1)', '64 (13.9)']
    assert_list_equal(row2, row2_expect)

    row3 = list(t1.tableone.loc["ICU, n (%)"][group].values[0])
    row3_expect = [0, '137 (84.6)', '25 (15.4)']
    assert_list_equal(row3, row3_expect)

    row4 = list(t1.tableone.loc["ICU, n (%)"][group].values[1])
    row4_expect = ['', '194 (96.0)', '8 (4.0)']
    assert_list_equal(row4, row4_expect)

    row5 = list(t1.tableone.loc["ICU, n (%)"][group].values[2])
    row5_expect = ['', '318 (83.7)', '62 (16.3)']
    assert_list_equal(row5, row5_expect)

    row6 = list(t1.tableone.loc["ICU, n (%)"][group].values[3])
    row6_expect = ['', '215 (84.0)', '41 (16.0)']
    assert_list_equal(row6, row6_expect)

0 comments on commit 938fdf9

Please sign in to comment.