Merge pull request #452 from machow/feat-across
Feat across
machow committed Oct 12, 2022
2 parents ddc63af + 3d4a79a commit 5be9e2f
Showing 41 changed files with 2,468 additions and 1,406 deletions.
5 changes: 3 additions & 2 deletions siuba/__init__.py
@@ -2,9 +2,10 @@
__version__ = "0.3.0"

# default imports--------------------------------------------------------------
from .siu import _, Lam
from .siu import _, Fx, Lam
from .dply.across import across
from .dply.verbs import *
from .dply.verbs import __all__ as ALL_DPLY

# necessary, since _ won't be exposed in import * by default
__all__ = ['_', *ALL_DPLY]
__all__ = ['_', "Fx", "across", *ALL_DPLY]
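
This exposes Fx and across alongside _ at the package root. A minimal sketch of the new import surface (assuming a build of this branch is installed):

from siuba import _, Fx, across

# in across expressions, Fx stands in for the column currently being operated on,
# e.g. Fx.round() in the examples further down this diff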
164 changes: 164 additions & 0 deletions siuba/dply/across.py
@@ -0,0 +1,164 @@
import pandas as pd
from pandas.api import types as pd_types

from pandas.core.groupby import DataFrameGroupBy
from .verbs import var_select, var_create
from ..siu import FormulaContext, Call, strip_symbolic, Fx, FuncArg
from ..siu.dispatchers import verb_dispatch, symbolic_dispatch, create_eager_pipe_call

from collections.abc import Mapping
from contextvars import ContextVar
from contextlib import contextmanager
from typing import Callable, Any

DEFAULT_MULTI_FUNC_TEMPLATE = "{col}_{fn}"
DEFAULT_SINGLE_FUNC_TEMPLATE = "{col}"
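# e.g. a dict of functions {"mean": ..., "max": ...} applied to column "a"
# yields "a_mean" and "a_max"; a single function keeps the name "a"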


ctx_verb_data = ContextVar("data")
ctx_verb_window = ContextVar("window")
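# the two context variables above are set and reset around verb evaluation
# (see _eval_with_context and _set_data_context below), so nested expressions
# can look up the current data and whether a window context applies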


def _is_symbolic_operator(f):
# TODO: consolidate these checks, make the result of symbolic_dispatch a class.
return callable(f) and getattr(f, "_siu_symbolic_operator", False)


def _require_across(call, verb_name):
if (
not isinstance(call, Call)
or not (call.args and getattr(call.args[0], "__name__", None) == "across")
):
raise NotImplementedError(
f"{verb_name} currently only allows a top-level across as an unnamed argument.\n\n"
f"Example: {verb_name}(some_data, across(...))"
)


def _eval_with_context(ctx, window_ctx, data, expr):
    # TODO: should just set the translator as context (e.g. agg translator, etc.)
token = ctx_verb_data.set(ctx)
token_win = ctx_verb_window.set(window_ctx)

try:
return expr(data)
finally:
ctx_verb_data.reset(token)
ctx_verb_window.reset(token_win)


@contextmanager
def _set_data_context(ctx, window):
try:
token = ctx_verb_data.set(ctx)
token_win = ctx_verb_window.set(window)
yield
finally:
ctx_verb_data.reset(token)
ctx_verb_window.reset(token_win)



# TODO: handle DataFrame manipulation in pandas / sql backends
class AcrossResult(Mapping):
def __init__(self, *args, **kwargs):
self.d = dict(*args, **kwargs)

def __getitem__(self, k):
return self.d[k]

def __iter__(self):
return iter(self.d)

def __len__(self):
return len(self.d)


def _across_setup_fns(fns) -> "dict[str, Callable[[FormulaContext], Any]]":
final_calls = {}
if isinstance(fns, (list, tuple)):
raise NotImplementedError(
"Specifying functions as a list or tuple is not supported. "
"Please use a dictionary to define multiple functions to apply. \n\n"
"E.g. across(_[:], {'round': Fx.round(), 'round2': Fx.round() + 1})"
)
elif isinstance(fns, dict):
for name, fn_call_raw in fns.items():
# symbolics get stripped by default for arguments to verbs, but
# these are inside a dictionary, so need to strip manually.
fn_call = strip_symbolic(fn_call_raw)

if isinstance(fn_call, Call):
final_calls[name] = fn_call

elif callable(fn_call):
final_calls[name] = create_eager_pipe_call(FuncArg(fn_call), Fx)

else:
raise TypeError(
"All functions to be applied in across must be a siuba.siu.Call, "
f"but received a function of type {type(fn_call)}"
)

elif isinstance(fns, Call):
final_calls["fn1"] = fns

elif callable(fns):
final_calls["fn1"] = create_eager_pipe_call(FuncArg(fns), Fx)

else:
raise NotImplementedError(f"Unsupported function type in across: {type(fns)}")

return final_calls


def _get_name_template(fns, names: "str | None") -> str:
if names is not None:
return names

if callable(fns):
return DEFAULT_SINGLE_FUNC_TEMPLATE

return DEFAULT_MULTI_FUNC_TEMPLATE


@verb_dispatch(pd.DataFrame)
def across(__data, cols, fns, names: "str | None" = None) -> pd.DataFrame:

name_template = _get_name_template(fns, names)
selected_cols = var_select(__data.columns, *var_create(cols), data=__data)

fns_map = _across_setup_fns(fns)

results = {}
for old_name, new_name in selected_cols.items():
if new_name is None:
new_name = old_name

crnt_ser = __data[old_name]
context = FormulaContext(Fx=crnt_ser, _=__data)

for fn_name, fn in fns_map.items():
fmt_pars = {"fn": fn_name, "col": new_name}

res = fn(context)
results[name_template.format(**fmt_pars)] = res

# ensure at least one result is not a scalar, so we don't get the classic
# pandas error: "If using all scalar values, you must pass an index"
index = None
if results:
_, v = next(iter(results.items()))
if pd_types.is_scalar(v):
index = [0]

return pd.DataFrame(results, index=index)


@symbolic_dispatch(cls = pd.Series)
def where(x) -> bool:
if not isinstance(x, bool):
raise TypeError("Result of where clause must be a boolean (True or False).")

return x
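
A rough usage sketch of the new across verb called directly on a DataFrame (not inside a pipe). The _[:] select-everything spec comes from the error message in _across_setup_fns, and the behavior described in the comments is inferred from this diff rather than tested here:

import pandas as pd
from siuba import _, Fx, across

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

# single function: result names follow the "{col}" template -> columns a, b
across(df, _[:], Fx.mean())

# dict of named functions: names follow "{col}_{fn}" -> a_min, a_max, b_min, b_max
across(df, _[:], {"min": Fx.min(), "max": Fx.max()})

# every result above is a scalar, so across returns a one-row DataFrame (index=[0])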

44 changes: 42 additions & 2 deletions siuba/dply/tidyselect.py
@@ -3,6 +3,9 @@
from siuba.siu import Call, MetaArg, BinaryOp
from collections import OrderedDict
from itertools import chain
from functools import singledispatch

from typing import List

class Var:
def __init__(self, name: "str | int | slice | Call", negated = False, alias = None):
@@ -137,7 +140,7 @@ def flatten_var(var):
return [var]


def var_select(colnames, *args):
def var_select(colnames, *args, data=None):
# TODO: don't erase named column if included again
colnames = colnames if isinstance(colnames, pd.Series) else pd.Series(colnames)
cols = OrderedDict()
@@ -147,12 +150,15 @@ def var_select(colnames, *args):

# Add entries in pandas.rename style {"orig_name": "new_name"}
for ii, arg in enumerate(all_vars):

# strings are added directly
if isinstance(arg, str):
cols[arg] = None

# integers add colname at corresponding index
elif isinstance(arg, int):
cols[colnames.iloc[arg]] = None

# general var handling
elif isinstance(arg, Var):
# remove negated Vars, otherwise include them
@@ -165,6 +171,7 @@
start, stop = var_slice(colnames, arg.name)
for ii in range(start, stop):
var_put_cols(colnames[ii], arg, cols)

# method calls like endswith()
elif callable(arg.name):
# TODO: not sure if this is a good idea...
@@ -176,6 +183,14 @@
var_put_cols(colnames.iloc[arg.name], arg, cols)
else:
var_put_cols(arg.name, arg, cols)
elif callable(arg) and data is not None:
# TODO: call on the data
col_mask = colwise_eval(data, arg)

for name in colnames[col_mask]:
cols[name] = None


else:
raise Exception("variable must be either a string or Var instance")

@@ -186,14 +201,39 @@ def var_create(*args) -> "tuple[Var]":
vl = VarList()
all_vars = []
for arg in args:
if callable(arg) and not isinstance(arg, Var):
if isinstance(arg, Call):
res = arg(vl)
if isinstance(res, VarList):
raise ValueError("Must select specific column. Did you pass `_` to select?")
all_vars.append(res)
elif isinstance(arg, Var):
all_vars.append(arg)
elif callable(arg):
all_vars.append(arg)
else:
all_vars.append(Var(arg))

return tuple(all_vars)


@singledispatch
def colwise_eval(data, predicate):
raise NotImplementedError(
f"Cannot evaluate tidyselect predicate on data type: {type(data)}"
)


@colwise_eval.register
def _colwise_eval_pd(data: pd.DataFrame, predicate) -> List[bool]:
mask = []
for col_name in data:
res = predicate(data.loc[:, col_name])
if not pd.api.types.is_bool(res):
raise TypeError("TODO")

mask.append(res)

return mask
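
A small sketch of the new callable-predicate path through var_create and var_select (the new data= argument). The is_numeric helper is hypothetical; each callable is evaluated column by column via colwise_eval and must return a plain bool:

import pandas as pd
from siuba.dply.tidyselect import var_create, var_select

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

def is_numeric(ser: pd.Series) -> bool:
    # hypothetical predicate; is_numeric_dtype returns a Python bool, as required
    return pd.api.types.is_numeric_dtype(ser)

# should select only "a", in pandas.rename style {"old_name": new_name_or_None}
var_select(df.columns, *var_create(is_numeric), data=df)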


