Skip to content

Commit

Permalink
Merge pull request #110 from tompollard/row_percent
Browse files Browse the repository at this point in the history
add option to calculate "n(%)" percentages over a row. Ref #108
  • Loading branch information
tompollard committed Jan 4, 2021
2 parents 5b3d644 + d89fde9 commit 938fdf9
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 26 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
Expand Up @@ -58,9 +58,9 @@
# built documents.
#
# The short X.Y version.
version = u'0.7.9'
version = u'0.7.10'
# The full version, including alpha/beta/rc tags.
release = u'0.7.9'
release = u'0.7.10'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -17,7 +17,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='0.7.9',
version='0.7.10',

description='TableOne',
long_description=long_description,
Expand Down
2 changes: 1 addition & 1 deletion tableone/__init__.py
Expand Up @@ -2,4 +2,4 @@
from .tableone import TableOne, load_dataset, tableone

__author__ = "Tom Pollard <tpollard@mit.edu>, Alistair Johnson, Jesse Raffa"
__version__ = "0.7.9"
__version__ = "0.7.10"
67 changes: 48 additions & 19 deletions tableone/tableone.py
Expand Up @@ -145,6 +145,9 @@ class TableOne(object):
overall : bool, optional
If True, add an "overall" column to the table. Smd and p-value
calculations are performed only using stratified columns.
row_percent : bool, optional
If True, compute "n (%)" percentages for categorical variables across
"groupby" rows rather than columns.
display_all : bool, optional
If True, set pd.display_options to display all columns and rows.
(default: False)
Expand Down Expand Up @@ -206,9 +209,8 @@ def __init__(self, data: pd.DataFrame, columns: Optional[list] = None,
order: Optional[dict] = None, remarks: bool = False,
label_suffix: bool = True, decimals: Union[int, dict] = 1,
smd: bool = False, overall: bool = True,
display_all: bool = False,
dip_test: bool = False,
normal_test: bool = False,
row_percent: bool = False, display_all: bool = False,
dip_test: bool = False, normal_test: bool = False,
tukey_test: bool = False) -> None:

# labels is now rename
Expand Down Expand Up @@ -348,6 +350,7 @@ def __init__(self, data: pd.DataFrame, columns: Optional[list] = None,
self._decimals = decimals
self._smd = smd
self._overall = overall
self._row_percent = row_percent

# display notes and warnings below the table
self._warnings = {}
Expand Down Expand Up @@ -928,14 +931,14 @@ def _create_cont_describe(self, data, groupby):

return df_cont

def _format_cat(self, row):
def _format_cat(self, row, col):
var = row.name[0]
if var in self._decimals:
n = self._decimals[var]
else:
n = 1
f = '{{:.{}f}}'.format(n)
return f.format(row.percent)
return f.format(row[col])

def _create_cat_describe(self, data, groupby, groupbylvls):
"""
Expand All @@ -945,6 +948,10 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
----------
data : pandas DataFrame
The input dataset.
groupby : Str
Variable to group by.
groupbylvls : List
List of levels in the groupby variable.
Returns
----------
Expand All @@ -953,46 +960,64 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
"""
group_dict = {}

cat_slice = data[self._categorical].copy()

for g in groupbylvls:
if groupby:
d_slice = data.loc[data[groupby] == g, self._categorical]
df = cat_slice.loc[data[groupby] == g, self._categorical]
else:
d_slice = data[self._categorical].copy()
df = cat_slice.copy()

# create a dataframe with freq, proportion
df = d_slice.copy()
# create n column and null count column
# must be done before converting values to strings
ct = df.count().to_frame(name='n')
ct.index.name = 'variable'
nulls = df.isnull().sum().to_frame(name='Missing')
nulls.index.name = 'variable'

# convert to str to handle int converted to boolean. Avoid nans.
# Convert to str to handle int converted to boolean in the index.
# Also avoid nans.
for column in df.columns:
df[column] = [str(row) if not pd.isnull(row)
else None for row in df[column].values]
cat_slice[column] = [str(row) if not pd.isnull(row)
else None for row
in cat_slice[column].values]

# create a dataframe with freq, proportion
df = df.melt().groupby(['variable',
'value']).size().to_frame(name='freq')

df['percent'] = df['freq'].div(df.freq.sum(level=0),
level=0).astype(float) * 100

# add row percent
df['percent_row'] = df['freq'].div(cat_slice[self._categorical]
.melt()
.groupby(['variable', 'value'])
.size()) * 100

# set number of decimal places for percent
if isinstance(self._decimals, int):
n = self._decimals
f = '{{:.{}f}}'.format(n)
df['percent_str'] = df['percent'].astype(float).map(f.format)
df['percent_row_str'] = df['percent_row'].astype(float).map(f.format)
elif isinstance(self._decimals, dict):
df.loc[:, 'percent_str'] = df.apply(self._format_cat, axis=1)
df.loc[:, 'percent_str'] = df.apply(self._format_cat, axis=1,
args=['percent'])
df.loc[:, 'percent_row_str'] = df.apply(self._format_cat,
axis=1,
args=['percent_row'])
else:
n = 1
f = '{{:.{}f}}'.format(n)
df['percent_str'] = df['percent'].astype(float).map(f.format)
df['percent_row_str'] = df['percent_row'].astype(float).map(f.format)

# add n column, listing total non-null values for each variable
ct = d_slice.count().to_frame(name='n')
ct.index.name = 'variable'
# join count column
df = df.join(ct)

# add null count
nulls = d_slice.isnull().sum().to_frame(name='Missing')
nulls.index.name = 'variable'
# only save null count to the first category for each variable
# do this by extracting the first category from the df row index
levels = df.reset_index()[['variable',
Expand All @@ -1004,8 +1029,12 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
df = df.join(nulls)

# add summary column
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_str.map(str)+')')
if self._row_percent:
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_row_str.map(str)+')')
else:
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_str.map(str)+')')

# add to dictionary
group_dict[g] = df
Expand Down
158 changes: 155 additions & 3 deletions test_tableone.py
Expand Up @@ -2,7 +2,8 @@
import warnings

from nose.tools import (with_setup, assert_raises, assert_equal,
assert_almost_equal, assert_list_equal)
assert_almost_equal, assert_list_equal,
assert_count_equal)
import numpy as np
import pandas as pd
from scipy import stats
Expand Down Expand Up @@ -1097,8 +1098,6 @@ def test_min_max_for_nonnormal_variables(self):
# optionally, a categorical variable for stratification
groupby = ['death']

self.data_pn

t1 = TableOne(self.data_pn, columns=columns, categorical=categorical,
groupby=groupby, nonnormal=nonnormal, decimals=decimals,
min_max=['Age'])
Expand All @@ -1110,3 +1109,156 @@ def test_min_max_for_nonnormal_variables(self):
for c, e in zip(t1_columns, expected):
cell = t1.tableone.loc[k][group][c].values[0]
assert_equal(cell, e)

@with_setup(setup, teardown)
def test_row_percent_false(self):
    """
    Test row_percent=False displays n(%) for the column.
    """
    # columns to summarize
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']

    # columns containing categorical variables
    categorical = ['ICU', 'MechVent']

    # set decimal places for age to 0
    decimals = {"Age": 0}

    # non-normal variables
    nonnormal = ['Age']

    # optionally, a categorical variable for stratification
    groupby = ['death']
    group = "Grouped by death"

    # row_percent = False
    t1 = TableOne(self.data_pn, columns=columns,
                  categorical=categorical, groupby=groupby,
                  nonnormal=nonnormal, decimals=decimals,
                  row_percent=False)

    # expected "n (%)" cells keyed by table row label; one inner list per
    # category level, in display order
    expected = {
        "MechVent, n (%)": [
            [0, '540 (54.0)', '468 (54.2)', '72 (52.9)'],
            ['', '460 (46.0)', '396 (45.8)', '64 (47.1)'],
        ],
        "ICU, n (%)": [
            [0, '162 (16.2)', '137 (15.9)', '25 (18.4)'],
            ['', '202 (20.2)', '194 (22.5)', '8 (5.9)'],
            ['', '380 (38.0)', '318 (36.8)', '62 (45.6)'],
            ['', '256 (25.6)', '215 (24.9)', '41 (30.1)'],
        ],
    }
    for label, rows in expected.items():
        for i, expect in enumerate(rows):
            observed = list(t1.tableone.loc[label][group].values[i])
            assert_list_equal(observed, expect)

@with_setup(setup, teardown)
def test_row_percent_true(self):
    """
    Test row_percent=True displays n(%) for the row rather than the column.
    """
    # columns to summarize
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']

    # columns containing categorical variables
    categorical = ['ICU', 'MechVent']

    # set decimal places for age to 0
    decimals = {"Age": 0}

    # non-normal variables
    nonnormal = ['Age']

    # optionally, a categorical variable for stratification
    groupby = ['death']
    group = "Grouped by death"

    # row_percent = True
    t2 = TableOne(self.data_pn, columns=columns,
                  categorical=categorical, groupby=groupby,
                  nonnormal=nonnormal, decimals=decimals,
                  row_percent=True)

    # with row percentages the "Overall" column is always 100.0 and the
    # stratified columns sum to 100 across each row
    mechvent_expected = [
        [0, '540 (100.0)', '468 (86.7)', '72 (13.3)'],
        ['', '460 (100.0)', '396 (86.1)', '64 (13.9)'],
    ]
    icu_expected = [
        [0, '162 (100.0)', '137 (84.6)', '25 (15.4)'],
        ['', '202 (100.0)', '194 (96.0)', '8 (4.0)'],
        ['', '380 (100.0)', '318 (83.7)', '62 (16.3)'],
        ['', '256 (100.0)', '215 (84.0)', '41 (16.0)'],
    ]

    for i, expect in enumerate(mechvent_expected):
        observed = list(t2.tableone.loc["MechVent, n (%)"][group].values[i])
        assert_list_equal(observed, expect)

    for i, expect in enumerate(icu_expected):
        observed = list(t2.tableone.loc["ICU, n (%)"][group].values[i])
        assert_list_equal(observed, expect)

@with_setup(setup, teardown)
def test_row_percent_true_and_overall_false(self):
    """
    Test that row_percent=True combined with overall=False displays n(%)
    across the row and omits the "Overall" column from the table.
    """
    # columns to summarize
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']

    # columns containing categorical variables
    categorical = ['ICU', 'MechVent']

    # set decimal places for age to 0
    decimals = {"Age": 0}

    # non-normal variables
    nonnormal = ['Age']

    # optionally, a categorical variable for stratification
    groupby = ['death']
    group = "Grouped by death"

    # row_percent = True
    t1 = TableOne(self.data_pn, columns=columns, overall=False,
                  categorical=categorical, groupby=groupby,
                  nonnormal=nonnormal, decimals=decimals,
                  row_percent=True)

    # expected rows have one fewer cell than the overall=True case:
    # [Missing, death=0 "n (%)", death=1 "n (%)"]
    row1 = list(t1.tableone.loc["MechVent, n (%)"][group].values[0])
    row1_expect = [0, '468 (86.7)', '72 (13.3)']
    assert_list_equal(row1, row1_expect)

    row2 = list(t1.tableone.loc["MechVent, n (%)"][group].values[1])
    row2_expect = ['', '396 (86.1)', '64 (13.9)']
    assert_list_equal(row2, row2_expect)

    row3 = list(t1.tableone.loc["ICU, n (%)"][group].values[0])
    row3_expect = [0, '137 (84.6)', '25 (15.4)']
    assert_list_equal(row3, row3_expect)

    row4 = list(t1.tableone.loc["ICU, n (%)"][group].values[1])
    row4_expect = ['', '194 (96.0)', '8 (4.0)']
    assert_list_equal(row4, row4_expect)

    row5 = list(t1.tableone.loc["ICU, n (%)"][group].values[2])
    row5_expect = ['', '318 (83.7)', '62 (16.3)']
    assert_list_equal(row5, row5_expect)

    row6 = list(t1.tableone.loc["ICU, n (%)"][group].values[3])
    row6_expect = ['', '215 (84.0)', '41 (16.0)']
    assert_list_equal(row6, row6_expect)

0 comments on commit 938fdf9

Please sign in to comment.