Skip to content

Commit

Permalink
✨ Support base.glimpse() (#107, machow/siuba#409)
Browse files Browse the repository at this point in the history
  • Loading branch information
pwwang committed Apr 12, 2022
1 parent db58d46 commit 15781a6
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 20 deletions.
1 change: 1 addition & 0 deletions datar/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
rank,
outer,
)
from .glimpse import glimpse
from .logical import (
FALSE,
TRUE,
Expand Down
183 changes: 183 additions & 0 deletions datar/base/glimpse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""Provides glimpse"""
import textwrap
import html
from functools import singledispatch
from shutil import get_terminal_size

from pipda import register_verb

from ..core.tibble import TibbleGrouped, TibbleRowwise
from ..core.backends.pandas import DataFrame
from ..core.backends.pandas.core.groupby import SeriesGroupBy


@singledispatch
def formatter(x):
    """Render one data frame element as text for the glimpse view.

    This is the single-dispatch fallback used for every type that has no
    more specific implementation registered: it returns ``str(x)``.
    """
    text = str(x)
    return text


@formatter.register(DataFrame)
def _dataframe_formatter(x):
    """Summarize a nested data frame element as ``<DF {rows}x{columns}>``."""
    n_rows, n_cols = x.shape[0], x.shape[1]
    return f"<DF {n_rows}x{n_cols}>"


@formatter.register(str)
def _str_formatter(x):
    """Render a string element through ``repr`` so it is shown quoted."""
    quoted = repr(x)
    return quoted


def _is_notebook() -> bool: # pragma: no cover
"""Check if the current environment is notebook"""
try:
from IPython import get_ipython
shell = get_ipython().__class__.__name__
if shell == "ZMQInteractiveShell":
return True # Jupyter notebook or qtconsole
elif shell == "TerminalInteractiveShell":
return False # Terminal running IPython
else:
return False # Other type (?)
except (ImportError, NameError):
return False # Probably standard Python interpreter


class Glimpse:
    """Compact view of a data frame: one output line per column.

    Each column is rendered as its name, its dtype and as many of its
    leading values as fit in the available width.

    Args:
        x: The data to be glimpsed
        width: The width of the output. A falsy value (``None``, ``0``)
            falls back to the terminal width, or 100 columns when the
            terminal size cannot be determined.
        formatter: The formatter to use to format data elements
    """

    def __init__(self, x, width, formatter) -> None:
        self.x = x
        self.width = width or get_terminal_size((100, 20)).columns
        self.formatter = formatter
        # (column-name width, dtype width); only filled in by
        # _calculate_output_widths().
        self.colwidths = (0, 0)

    def __repr__(self) -> str:
        return f"<Glimpse: {hash(self)}>"

    def __str__(self) -> str:
        """Render the plain-text view: header lines, then one line per column."""
        self._calculate_output_widths()
        return "\n".join(
            (
                "\n".join(self._general()),
                "\n".join(self._variables()),
            )
        )

    def _repr_html_(self):
        """Render the HTML view: header ``<div>``s followed by a ``<table>``.

        NOTE(review): unlike ``__str__`` this does not refresh
        ``self.colwidths``, so the width budget for the data cells depends
        on whether ``__str__`` ran before -- confirm this is intended.
        """
        out = []
        for gen in self._general():
            # Header lines may embed group names, so escape them.
            out.append(f"<div><i>{html.escape(gen)}</i></div>")
        out.append("<table>")
        out.extend(self._variables(fmt="html"))
        out.append("</table>")
        return "\n".join(out)

    def _general(self):
        """Return the header lines (row/column counts, grouping) as a tuple."""
        summary = (
            f"Rows: {self.x.shape[0]}",
            f"Columns: {self.x.shape[1]}",
        )
        if not isinstance(self.x, TibbleGrouped):
            return summary

        groups = ", ".join(str(name) for name in self.x.group_vars)
        group_title = (
            "Rowwise" if isinstance(self.x, TibbleRowwise) else "Groups"
        )
        return summary + (
            f"{group_title}: {groups} "
            f"[{self.x._datar['grouped'].grouper.ngroups}]",
        )

    def _calculate_output_widths(self):
        """Store the widths of the name and dtype columns in ``colwidths``.

        The dtype width includes 2 extra characters for the surrounding
        ``<`` and ``>``. A frame without columns yields ``(0, 2)`` instead
        of raising on an empty ``max()``.
        """
        colname_width = max(
            (len(str(colname)) for colname in self.x.columns),
            default=0,
        )
        dtype_width = max(
            (len(str(dtype)) for dtype in self.x.dtypes),
            default=0,
        ) + 2
        self.colwidths = (colname_width, dtype_width)

    def _variables(self, fmt="str"):
        """Yield one formatted line (or HTML table row) per column of ``x``."""
        for col in self.x:
            column = self.x[col]  # looked up once per column
            dtype = column.dtype
            # A grouped column exposes its underlying data through `.obj`.
            values = (
                column.obj.values
                if isinstance(column, SeriesGroupBy)
                else column.values
            )
            yield self._format_variable(col, dtype, values, fmt=fmt)

    def _format_variable(self, col, dtype, data, fmt="str"):
        """Dispatch to the text (``fmt="str"``) or HTML (otherwise) renderer."""
        if fmt == "str":
            return self._format_variable_str(col, dtype, data)

        return self._format_variable_html(col, dtype, data)

    def _format_data(self, data):
        """Format the data for the glimpse view

        Formatting 10 elements in a batch in case of a long dataframe.
        Since we don't need to format all the data, but only the first few
        until the line (terminal width or provided width) overflows.

        Args:
            data: Array of the column values; must support slicing and
                expose ``.size``.

        Returns:
            The comma-separated formatted values, cut off with ``…`` once
            they no longer fit in the width left after the name and dtype
            columns.
        """
        placeholder = "…"
        # 4 = the leading ". " plus the two single-space column separators.
        # textwrap.shorten() raises ValueError for a width that is <= 0 or
        # cannot hold the placeholder, so never go below the placeholder.
        available = max(
            self.width - 4 - sum(self.colwidths),
            len(placeholder),
        )
        chunk_size = 10
        out = ""
        i = 0
        # Stop as soon as the text got truncated (it then ends with the
        # placeholder) or all elements have been consumed.
        while not out.endswith(placeholder) and i < data.size:
            if out:
                out += ", "
            out += ", ".join(
                self.formatter(d) for d in data[i:i + chunk_size]
            )
            i += chunk_size
            out = textwrap.shorten(
                out,
                break_long_words=True,
                break_on_hyphens=True,
                width=available,
                placeholder=placeholder,
            )
        return out

    def _format_variable_str(self, col, dtype, data):
        """Build the plain-text line ``. <name> <dtype> <values>`` for a column."""
        # Column labels are not necessarily strings (e.g. integers).
        name_col = str(col).ljust(self.colwidths[0])
        dtype_col = f'<{dtype}>'.ljust(self.colwidths[1])
        data_col = self._format_data(data)
        return f". {name_col} {dtype_col} {data_col}"

    def _format_variable_html(self, col, dtype, data):
        """Build the HTML table row for a column, escaping all dynamic text."""
        name_col = f". <b>{html.escape(str(col))}</b>"
        dtype_col = f"<i>&lt;{html.escape(str(dtype))}&gt;</i>"
        data_col = html.escape(self._format_data(data))
        return (
            f"<tr><th style=\"text-align: left\">{name_col}</th>"
            f"<td style=\"text-align: left\">{dtype_col}</td>"
            f"<td style=\"text-align: left\">{data_col}</td></tr>"
        )

    def show(self):
        """Show the glimpse view"""
        if _is_notebook():  # pragma: no cover
            from IPython.display import display, HTML
            display(HTML(self._repr_html_()))
        else:
            print(str(self))


@register_verb(DataFrame)
def glimpse(x, width=None, formatter=formatter):
    """Get a glimpse of your data

    Builds a `Glimpse` view of `x` and shows it right away; nothing is
    returned.

    Args:
        x: An object to glimpse at.
        width: Width of output, defaults to the width of the console.
        formatter: A single-dispatch function to format a single element.
    """
    view = Glimpse(x, width=width, formatter=formatter)
    view.show()
40 changes: 20 additions & 20 deletions docs/notebooks/nest.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.142333Z",
Expand Down Expand Up @@ -831,7 +831,7 @@
"49 5.0 3.3 1.4 0.2"
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -843,7 +843,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.186712Z",
Expand Down Expand Up @@ -911,7 +911,7 @@
"2 virginica <DF 50x4>"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -923,7 +923,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.207533Z",
Expand Down Expand Up @@ -996,7 +996,7 @@
"2 virginica <DF 50x2> <DF 50x2>"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1007,7 +1007,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.265853Z",
Expand Down Expand Up @@ -1080,7 +1080,7 @@
"2 virginica <DF 50x2> <DF 50x2>"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1091,7 +1091,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.333173Z",
Expand Down Expand Up @@ -1257,7 +1257,7 @@
"[TibbleGrouped: fish (n=19)]"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1268,7 +1268,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.401063Z",
Expand Down Expand Up @@ -1343,7 +1343,7 @@
"[TibbleGrouped: cyl (n=3)]"
]
},
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1363,7 +1363,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.451681Z",
Expand Down Expand Up @@ -1443,7 +1443,7 @@
"3 3 3 2"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1462,7 +1462,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.535876Z",
Expand Down Expand Up @@ -1549,7 +1549,7 @@
"4 3 3.0 2.0"
]
},
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1560,7 +1560,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.550880Z",
Expand Down Expand Up @@ -1633,7 +1633,7 @@
"2 c 3 22"
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1649,7 +1649,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2021-07-16T22:28:27.614822Z",
Expand Down Expand Up @@ -1736,7 +1736,7 @@
"4 c 3 22"
]
},
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
2 changes: 2 additions & 0 deletions docs/reference-maps/base.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ See [here](../stats) for APIs ported from `r-stats` and [here](../utils) for API

|API|Description|Notebook example|
|---|---|---:|
|[`glimpse()`][166]|Get a glimpse of your data||
|[`cut()`][113]|Convert Numeric to Factor|[:material-notebook:][163]|
|[`diff()`][164]|Returns suitably lagged and iterated differences.|[:material-notebook:][163]|
|[`identity()`][114]|Identity Function|[:material-notebook:][163]|
Expand Down Expand Up @@ -464,3 +465,4 @@ See [here](../stats) for APIs ported from `r-stats` and [here](../utils) for API
[163]: ../../notebooks/base-funs
[164]: ../../api/datar.base.funs/#datar.base.funs.diff
[165]: ../../api/datar.base.funs/#datar.base.funs.outer
[166]: ../../api/datar.base.glimpse/#datar.base.glimpse.glimpse

0 comments on commit 15781a6

Please sign in to comment.