Skip to content

Commit

Permalink
🔖 0.8.0 (#110)
Browse files Browse the repository at this point in the history
* 🩹 Attach original `__module__` to `func_factory` registered functions

* ✨ Allow configuration file to save default options;
 💥Replace option `warn_builtin_names` with `import_names_conflict` (#73)

* 🐛 Register `base.factor()` and accept grouped data (#108)

* ✨ Support `base.glimpse()` (#107, machow/siuba#409)

* ✅ Add tests for `base.factor()` with grouped data

* 🔖 0.8.0

* 📝 Update CHANGELOG
  • Loading branch information
pwwang committed Apr 12, 2022
1 parent 1e018fd commit 0814390
Show file tree
Hide file tree
Showing 25 changed files with 694 additions and 283 deletions.
7 changes: 4 additions & 3 deletions datar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .core import operator as _
from .core import f, options_context, options, add_option, get_option, logger
from .core.options import apply_init_callbacks

__all__ = (
"f",
Expand All @@ -11,10 +12,10 @@
"logger",
)

options(enable_pdtypes=True)

__all__ = ("f", "get_versions")
__version__ = "0.7.2"
__version__ = "0.8.0"

apply_init_callbacks()


def get_versions(prnt: bool = True):
Expand Down
46 changes: 23 additions & 23 deletions datar/all.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
"""Import all constants, verbs and functions"""

_locs = locals()

from . import base as _base
_base_conflict_names = _base._conflict_names
for _key in _base.__all__:
if _key not in _base_conflict_names:
_locs[_key] = getattr(_base, _key)

from . import dplyr as _dplyr
_dplyr_conflict_names = _dplyr._conflict_names
for _key in _dplyr.__all__:
if _key not in _dplyr_conflict_names:
_locs[_key] = getattr(_dplyr, _key)

from .core.defaults import f
from .base import (
_no_warn as _,
) # don't override from datar.all import _no_warn
from .base import _builtin_names as _base_builtin_names
from .base import *
from .base import _warn as _
from .forcats import *
from .datar import *
from .dplyr import _no_warn as _
from .dplyr import _builtin_names as _dplyr_builtin_names
from .dplyr import *
from .dplyr import _warn as _
from .tibble import *
from .tidyr import *
from .base import rank # overwrite dplyr.rank

_builtin_names = _base_builtin_names.copy()
_builtin_names.update(_dplyr_builtin_names)
# builtin names included
__all__ = [var_ for var_ in locals() if not var_.startswith("_")]

for name in _builtin_names:
# let __getattr__ handles the builtins, otherwise
# from datar.all import filter
# will not warn
del locals()[name]
from .core.import_names_conflict import (
handle_import_names_conflict as _handle_import_names_conflict,
)

from .core.warn_builtin_names import (
warn_builtin_names as _warn_builtin_names,
__all__, _getattr = _handle_import_names_conflict(
_locs,
_base_conflict_names | _dplyr_conflict_names,
)

__getattr__ = _warn_builtin_names(**_builtin_names)
if _getattr is not None:
__getattr__ = _getattr
24 changes: 8 additions & 16 deletions datar/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
rank,
outer,
)
from .glimpse import glimpse
from .logical import (
FALSE,
TRUE,
Expand Down Expand Up @@ -193,22 +194,13 @@
)
from .which import which, which_max, which_min

from ..core.import_names_conflict import (
handle_import_names_conflict as _handle_import_names_conflict
)

__all__ = [name for name in locals() if not name.startswith("_")]

_builtin_names = {
"min": min_,
"max": max_,
"sum": sum_,
"abs": abs_,
"round": round_,
"all": all_,
"any": any_,
"re": re_,
}
__all__.extend(_builtin_names)
_conflict_names = {"min", "max", "sum", "abs", "round", "all", "any", "re"}

# warn when builtin names are imported directly
from ..core.warn_builtin_names import warn_builtin_names
__all__, _getattr = _handle_import_names_conflict(locals(), _conflict_names)

__getattr__ = warn_builtin_names(**_builtin_names)
if _getattr is not None:
__getattr__ = _getattr
11 changes: 11 additions & 0 deletions datar/base/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pipda import register_func

from ..core.backends.pandas import Categorical, Series
from ..core.backends.pandas.core.groupby import SeriesGroupBy
from ..core.backends.pandas.api.types import is_categorical_dtype, is_scalar

from ..core.contexts import Context
Expand Down Expand Up @@ -71,6 +72,7 @@ def is_ordered(x) -> bool:
return _ensure_categorical(x).ordered


@register_func(None, context=Context.EVAL)
def factor(x=None, levels=None, exclude=np.nan, ordered=False):
"""encode a vector as a factor (the terms ‘category’ and ‘enumerated type’
are also used for factors).
Expand All @@ -87,6 +89,15 @@ def factor(x=None, levels=None, exclude=np.nan, ordered=False):
ordered: logical flag to determine if the levels should be regarded
as ordered (in the order given).
"""
if isinstance(x, SeriesGroupBy):
out = factor.__origfunc__(
x.obj,
levels=levels,
exclude=exclude,
ordered=ordered,
)
return Series(out, index=x.obj.index).groupby(x.grouper)

if x is None:
x = []

Expand Down
183 changes: 183 additions & 0 deletions datar/base/glimpse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""Provides glimpse"""
import textwrap
import html
from functools import singledispatch
from shutil import get_terminal_size

from pipda import register_verb

from ..core.tibble import TibbleGrouped, TibbleRowwise
from ..core.backends.pandas import DataFrame
from ..core.backends.pandas.core.groupby import SeriesGroupBy


@singledispatch
def formatter(x):
    """Default element formatter used by glimpse.

    This is the single-dispatch fallback: any value without a more specific
    registered implementation is rendered through its ``str()`` form.
    """
    return f"{x!s}"


@formatter.register(DataFrame)
def _dataframe_formatter(x):
    """Render a data frame element as ``<DF {rows}x{columns}>``."""
    n_rows = x.shape[0]
    n_cols = x.shape[1]
    return f"<DF {n_rows}x{n_cols}>"


@formatter.register(str)
def _str_formatter(x):
    """Render a string element through ``repr()`` so it is shown quoted."""
    return f"{x!r}"


def _is_notebook() -> bool:  # pragma: no cover
    """Tell whether the code is running inside a Jupyter notebook/qtconsole.

    Returns:
        True only when the active IPython shell class is named
        ``ZMQInteractiveShell``; False for a terminal IPython session, any
        other shell, or when IPython cannot be imported.
    """
    try:
        from IPython import get_ipython

        shell_name = get_ipython().__class__.__name__
    except (ImportError, NameError):
        # Probably the standard Python interpreter
        return False

    # "TerminalInteractiveShell" (terminal IPython) and every other shell
    # type are deliberately treated as "not a notebook".
    return shell_name == "ZMQInteractiveShell"


class Glimpse:
    """Glimpse view of a data frame: one output line per column.

    Args:
        x: The data to be glimpsed
        width: The width of the output. When falsy, the terminal width is
            used (``get_terminal_size`` is given a 100x20 fallback).
        formatter: The formatter to use to format data elements
    """

    def __init__(self, x, width, formatter) -> None:
        self.x = x
        self.width = width or get_terminal_size((100, 20)).columns
        self.formatter = formatter
        # (column-name width, dtype width); only populated for plain-text
        # rendering, see _calculate_output_widths()
        self.colwidths = (0, 0)

    def __repr__(self) -> str:
        return f"<Glimpse: {self.__hash__()}>"

    def __str__(self) -> str:
        """Plain-text rendering: the general lines followed by the columns."""
        self._calculate_output_widths()
        return "\n".join(
            (
                "\n".join(self._general()),
                "\n".join(self._variables()),
            )
        )

    def _repr_html_(self):
        """HTML rendering: general lines as divs, columns as table rows."""
        # Group names end up in the general lines, so escape them like the
        # data cells to keep the markup well-formed.
        out = [
            f"<div><i>{html.escape(gen)}</i></div>"
            for gen in self._general()
        ]
        out.append("<table>")
        out.extend(self._variables(fmt="html"))
        out.append("</table>")
        return "\n".join(out)

    def _general(self):
        """Header lines: row/column counts, plus grouping info if grouped."""
        if isinstance(self.x, TibbleGrouped):
            groups = ", ".join((str(name) for name in self.x.group_vars))
            group_title = (
                "Rowwise" if isinstance(self.x, TibbleRowwise) else "Groups"
            )
            return (
                f"Rows: {self.x.shape[0]}",
                f"Columns: {self.x.shape[1]}",
                f"{group_title}: {groups} "
                f"[{self.x._datar['grouped'].grouper.ngroups}]",
            )

        return (
            f"Rows: {self.x.shape[0]}",
            f"Columns: {self.x.shape[1]}",
        )

    def _calculate_output_widths(self):
        """Compute the padding widths of the name and dtype columns."""
        # default=0 keeps a frame without any columns from raising
        # "max() arg is an empty sequence".
        colname_width = max(
            (len(str(colname)) for colname in self.x.columns),
            default=0,
        )
        # +2 accounts for the surrounding "<" and ">"
        dtype_width = max(
            (len(str(dtype)) for dtype in self.x.dtypes),
            default=0,
        ) + 2
        self.colwidths = (colname_width, dtype_width)

    def _variables(self, fmt="str"):
        """Yield one formatted line (or HTML table row) per column."""
        for col in self.x:
            column = self.x[col]
            # For a SeriesGroupBy, read the values from the wrapped `.obj`
            values = (
                column.obj.values
                if isinstance(column, SeriesGroupBy)
                else column.values
            )
            yield self._format_variable(col, column.dtype, values, fmt=fmt)

    def _format_variable(self, col, dtype, data, fmt="str"):
        """Dispatch to the plain-text or HTML formatter for one column."""
        if fmt == "str":
            return self._format_variable_str(col, dtype, data)

        return self._format_variable_html(col, dtype, data)

    def _format_data(self, data):
        """Format the data for the glimpse view

        Formatting 10 elements in a batch in case of a long dataframe.
        Since we don't need to format all the data, but only the first a few
        till the line (terminal width or provided width) overflows.
        """
        out = ""
        placeholder = "…"
        # 4 = the ". " prefix plus the two separating spaces of a text line.
        # textwrap.shorten() raises ValueError when the width cannot even
        # hold the placeholder, so never go below that on narrow outputs.
        avail = max(self.width - 4 - sum(self.colwidths), len(placeholder))
        i = 0
        chunk_size = 10
        while not out.endswith(placeholder) and i < data.size:
            if out:
                out += ", "
            out += ", ".join(
                self.formatter(d) for d in data[i:i + chunk_size]
            )
            i += chunk_size
            out = textwrap.shorten(
                out,
                break_long_words=True,
                break_on_hyphens=True,
                width=avail,
                placeholder=placeholder,
            )
        return out

    def _format_variable_str(self, col, dtype, data):
        """Build the plain-text line of one column."""
        # Column labels are not necessarily strings (e.g. integers), so
        # stringify them the same way _calculate_output_widths() does.
        name_col = str(col).ljust(self.colwidths[0])
        dtype_col = f'<{dtype}>'.ljust(self.colwidths[1])
        data_col = self._format_data(data)
        return f". {name_col} {dtype_col} {data_col}"

    def _format_variable_html(self, col, dtype, data):
        """Build the HTML table row of one column."""
        # Escape every interpolated piece, not only the data cells, so that
        # labels such as "a<b" cannot break the table markup.
        name_col = f". <b>{html.escape(str(col))}</b>"
        dtype_col = f"<i>&lt;{html.escape(str(dtype))}&gt;</i>"
        data_col = html.escape(self._format_data(data))
        return (
            f"<tr><th style=\"text-align: left\">{name_col}</th>"
            f"<td style=\"text-align: left\">{dtype_col}</td>"
            f"<td style=\"text-align: left\">{data_col}</td></tr>"
        )

    def show(self):
        """Show the glimpse view"""
        if _is_notebook():  # pragma: no cover
            from IPython.display import display, HTML
            display(HTML(self._repr_html_()))
        else:
            print(str(self))


@register_verb(DataFrame)
def glimpse(x, width=None, formatter=formatter):
    """Print a glimpse of the data: one line per column.

    Args:
        x: An object to glimpse at.
        width: Width of output, defaults to the width of the console.
        formatter: A single-dispatch function to format a single element.
    """
    view = Glimpse(x, width=width, formatter=formatter)
    view.show()
4 changes: 4 additions & 0 deletions datar/core/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,10 @@ def _pipda_func(__x, *args, **kwargs):
_pipda_func.__name__ = funcname
_pipda_func.__qualname__ = qualname
_pipda_func.__doc__ = doc or func.__doc__
try:
_pipda_func.__module__ = func.__module__
except AttributeError:
pass
_pipda_func.dispatched = dispatched
_pipda_func.register = _register_factory(dispatched, func)
_pipda_func.__raw__ = func
Expand Down

0 comments on commit 0814390

Please sign in to comment.