Skip to content

Commit

Permalink
feat: add time and date dtypes
Browse files Browse the repository at this point in the history
Migrated from BigQuery googleapis/python-bigquery#972
  • Loading branch information
tswast committed Sep 21, 2021
1 parent 07758f2 commit f104171
Show file tree
Hide file tree
Showing 13 changed files with 1,552 additions and 13 deletions.
172 changes: 171 additions & 1 deletion db_dtypes/__init__.py
Expand Up @@ -15,8 +15,178 @@
Pandas Data Types for SQL systems (BigQuery, Spanner)
"""

from .version import __version__
import datetime

import numpy
import pandas
import pandas.compat.numpy.function
import pandas.core.algorithms
import pandas.core.arrays
import pandas.core.dtypes.base
import pandas.core.dtypes.dtypes
import pandas.core.dtypes.generic
import pandas.core.nanops
import pyarrow

from db_dtypes.version import __version__
from db_dtypes import core


date_dtype_name = "date"
time_dtype_name = "time"


@pandas.core.dtypes.dtypes.register_extension_dtype
class TimeDtype(core.BaseDatetimeDtype):
    """Extension dtype whose scalar values are ``datetime.time`` objects."""

    name = time_dtype_name
    type = datetime.time

    def construct_array_type(self):
        """Return the array class used to hold values of this dtype."""
        return TimeArray


class TimeArray(core.BaseDatetimeArray):
    """
    Pandas array type containing time data.

    Values are stored as ``datetime64`` anchored to the date
    Jan 1, 1970; only the time-of-day component is meaningful.
    """

    # Data are stored as datetime64 values with a date of Jan 1, 1970

    dtype = TimeDtype()
    _epoch = datetime.datetime(1970, 1, 1)
    _npepoch = numpy.datetime64(_epoch)

    @classmethod
    def _datetime(cls, scalar):
        """Convert a ``datetime.time`` or ISO "HH:MM:SS[.ffffff]" string
        into a ``datetime.datetime`` anchored at the epoch date.

        Raises:
            TypeError: for unsupported scalar types.
            ValueError: for malformed time strings.
        """
        if isinstance(scalar, datetime.time):
            return datetime.datetime.combine(cls._epoch, scalar)
        elif isinstance(scalar, str):
            # Parse the fractional seconds as a zero-padded digit string
            # instead of via float arithmetic: int(float(".876543") * 1e6)
            # truncates to 876542 because of binary rounding.
            hours, minutes, seconds = scalar.split(":")
            if "." in seconds:
                seconds, fraction = seconds.split(".")
                # ".5" means 500000 microseconds; digits beyond
                # microsecond precision are ignored.
                microseconds = int(fraction[:6].ljust(6, "0"))
            else:
                microseconds = 0
            return datetime.datetime(
                1970, 1, 1, int(hours), int(minutes), int(seconds), microseconds
            )
        else:
            raise TypeError("Invalid value type", scalar)

    def _box_func(self, x):
        """Convert a stored datetime64 value back to ``datetime.time``
        (``None`` for NaT)."""
        if pandas.isnull(x):
            return None

        try:
            return x.astype("<M8[us]").astype(datetime.datetime).time()
        except AttributeError:
            # Plain Python scalar (no .astype) -- normalize to datetime64.
            x = numpy.datetime64(x)
            return x.astype("<M8[us]").astype(datetime.datetime).time()

    # dtype strings for which astype() returns raw timedeltas since midnight.
    __return_deltas = {"timedelta", "timedelta64", "timedelta64[ns]", "<m8", "<m8[ns]"}

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; timedelta targets yield offsets from midnight."""
        deltas = self._ndarray - self._npepoch
        stype = str(dtype)
        if stype in self.__return_deltas:
            return deltas
        elif stype.startswith("timedelta64[") or stype.startswith("<m8["):
            return deltas.astype(dtype, copy=False)
        else:
            return super().astype(dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """Convert to a pyarrow array (default element type time64[ns])."""
        return pyarrow.array(
            self.to_numpy(), type=type if type is not None else pyarrow.time64("ns"),
        )


@pandas.core.dtypes.dtypes.register_extension_dtype
class DateDtype(core.BaseDatetimeDtype):
    """
    Extension dtype for date data.
    """

    name = date_dtype_name
    type = datetime.date

    def construct_array_type(self):
        # Array class used to hold values of this dtype.
        return DateArray


class DateArray(core.BaseDatetimeArray):
    """
    Pandas array type containing date data.

    Values are stored as ``datetime64`` at midnight; only the date
    component is meaningful.
    """

    # Data are stored as datetime64 values with a date of Jan 1, 1970

    dtype = DateDtype()

    @staticmethod
    def _datetime(scalar):
        """Convert a ``datetime.date`` or ISO "YYYY-MM-DD" string into a
        ``datetime.datetime`` at midnight.

        Raises:
            TypeError: for unsupported scalar types.
            ValueError: for malformed date strings.
        """
        if isinstance(scalar, datetime.date):
            return datetime.datetime(scalar.year, scalar.month, scalar.day)
        elif isinstance(scalar, str):
            # Unpack exactly three components. The previous
            # ``datetime.datetime(*map(int, scalar.split("-")))`` silently
            # accepted extra components (e.g. "2021-2-3-4" set hour=4);
            # now such strings raise ValueError.
            year, month, day = scalar.split("-")
            return datetime.datetime(int(year), int(month), int(day))
        else:
            raise TypeError("Invalid value type", scalar)

    def _box_func(self, x):
        """Convert a stored datetime64 value back to ``datetime.date``
        (``None`` for NaT)."""
        if pandas.isnull(x):
            return None
        try:
            return x.astype("<M8[us]").astype(datetime.datetime).date()
        except AttributeError:
            # Plain Python scalar (no .astype) -- normalize to datetime64.
            x = numpy.datetime64(x)
            return x.astype("<M8[us]").astype(datetime.datetime).date()

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; datetime64 targets expose the backing array."""
        stype = str(dtype)
        if stype.startswith("datetime"):
            if stype == "datetime" or stype == "datetime64":
                # Bare "datetime64" keeps our native (nanosecond) unit.
                dtype = self._ndarray.dtype
            return self._ndarray.astype(dtype, copy=copy)
        elif stype.startswith("<M8"):
            if stype == "<M8":
                dtype = self._ndarray.dtype
            return self._ndarray.astype(dtype, copy=copy)

        return super().astype(dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """Convert to a pyarrow array (default element type date32)."""
        return pyarrow.array(
            self._ndarray, type=type if type is not None else pyarrow.date32(),
        )

    def __add__(self, other):
        # date + offset -> object-dtype result; date + time -> datetime64.
        if isinstance(other, pandas.DateOffset):
            return self.astype("object") + other

        if isinstance(other, TimeArray):
            return (other._ndarray - other._npepoch) + self._ndarray

        return super().__add__(other)

    def __radd__(self, other):
        return self.__add__(other)

    def __sub__(self, other):
        # date - offset -> object-dtype result; date - date -> timedelta64.
        if isinstance(other, pandas.DateOffset):
            return self.astype("object") - other

        if isinstance(other, self.__class__):
            return self._ndarray - other._ndarray

        return super().__sub__(other)


# Public names exported by the db_dtypes package.
__all__ = [
    "__version__",
    "DateArray",
    "DateDtype",
    "TimeArray",
    "TimeDtype",
]
210 changes: 210 additions & 0 deletions db_dtypes/core.py
@@ -0,0 +1,210 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Optional, Sequence

import numpy
import pandas
from pandas._libs import NaT
from pandas._typing import Scalar
import pandas.compat.numpy.function
import pandas.core.algorithms
import pandas.core.arrays
import pandas.core.dtypes.base
from pandas.core.dtypes.common import is_dtype_equal, is_list_like, pandas_dtype
import pandas.core.dtypes.dtypes
import pandas.core.dtypes.generic
import pandas.core.nanops

from db_dtypes import pandas_backports


pandas_release = pandas_backports.pandas_release


class BaseDatetimeDtype(pandas.core.dtypes.base.ExtensionDtype):
    """Base class for the date and time extension dtypes.

    Subclasses set ``name`` and ``type``. Values are stored as
    datetime64, so the missing-value sentinel is NaT.
    """

    na_value = NaT
    kind = "o"
    names = None

    @classmethod
    def construct_from_string(cls, name):
        """Construct this dtype from its string name.

        Raises:
            TypeError: if ``name`` is not a string or does not match
                this dtype's ``name`` -- required by the pandas
                extension-dtype contract, with a descriptive message
                instead of the previous bare ``TypeError()``.
        """
        if not isinstance(name, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(name)}"
            )
        if name != cls.name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{name}'")

        return cls()


class BaseDatetimeArray(
    pandas_backports.OpsMixin, pandas_backports.NDArrayBackedExtensionArray
):
    """Base class for the date and time extension arrays.

    Values are backed by a numpy ``datetime64[ns]`` ndarray. Subclasses
    provide ``_datetime`` (scalar -> ``datetime.datetime``) to convert
    user-facing values into storage, and ``_box_func`` (stored value ->
    scalar) to convert back.
    """

    def __init__(self, values, dtype=None, copy: bool = False):
        # Anything that is not already a datetime64[ns] ndarray is
        # converted element-wise through the subclass's _datetime hook.
        if not (
            isinstance(values, numpy.ndarray) and values.dtype == numpy.dtype("<M8[ns]")
        ):
            values = self.__ndarray(values)
        elif copy:
            values = values.copy()

        super().__init__(values=values, dtype=values.dtype)

    @classmethod
    def __ndarray(cls, scalars):
        # Build the backing datetime64[ns] array; None entries become NaT.
        return numpy.array(
            [None if scalar is None else cls._datetime(scalar) for scalar in scalars],
            "M8[ns]",
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Construct a new array from a sequence of scalars.

        NOTE(review): ``copy`` is accepted for pandas API compatibility
        but ignored -- __ndarray always allocates a fresh array. The
        dtype check uses ``assert``, so it is skipped under ``-O``.
        """
        if dtype is not None:
            assert dtype.__class__ is cls.dtype.__class__
        return cls(cls.__ndarray(scalars))

    # Strings go through the same scalar conversion as other values.
    _from_sequence_of_strings = _from_sequence

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; a no-op (modulo ``copy``) for our own dtype."""
        dtype = pandas_dtype(dtype)
        if is_dtype_equal(dtype, self.dtype):
            if not copy:
                return self
            else:
                return self.copy()

        return super().astype(dtype, copy=copy)

    def _cmp_method(self, other, op):
        # Element-wise comparison; only same-typed arrays are comparable.
        if type(other) != type(self):
            return NotImplemented
        return op(self._ndarray, other._ndarray)

    def __setitem__(self, key, value):
        # Normalize assigned values through _datetime so the backing
        # array stays datetime64; NA scalars pass through untouched.
        if is_list_like(value):
            _datetime = self._datetime
            value = [_datetime(v) for v in value]
        elif not pandas.isna(value):
            value = self._datetime(value)
        return super().__setitem__(key, value)

    def _from_factorized(self, unique, original):
        # Re-wrap the unique values produced by pandas.factorize.
        return self.__class__(unique)

    def isna(self):
        """Return a boolean ndarray marking NaT positions."""
        return pandas.isna(self._ndarray)

    def _validate_scalar(self, value):
        """Validate a fill value for ``take``; NA values map to None.

        Raises:
            ValueError: if ``value`` is not an instance of the dtype's
                scalar type.
        """
        if pandas.isna(value):
            return None

        if not isinstance(value, self.dtype.type):
            raise ValueError(value)

        return value

    def take(
        self,
        indices: Sequence[int],
        *,
        allow_fill: bool = False,
        fill_value: Any = None,
    ):
        """Take elements by position (pandas ``ExtensionArray.take``).

        With ``allow_fill=True``, ``-1`` indices are replaced by
        ``fill_value`` (NaT when no fill value is given); other negative
        indices raise ValueError.
        """
        indices = numpy.asarray(indices, dtype=numpy.intp)
        data = self._ndarray
        if allow_fill:
            fill_value = self._validate_scalar(fill_value)
            fill_value = (
                numpy.datetime64()  # bare datetime64 constructor yields NaT
                if fill_value is None
                else numpy.datetime64(self._datetime(fill_value))
            )
            if (indices < -1).any():
                raise ValueError(
                    "take called with negative indexes other than -1,"
                    " when a fill value is provided."
                )
        out = data.take(indices)
        if allow_fill:
            out[indices == -1] = fill_value

        return self.__class__(out)

    # TODO: provide implementations of dropna, fillna, unique,
    # factorize, argsort, searchsorted for better performance over
    # abstract implementations.

    def any(
        self,
        *,
        axis: Optional[int] = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return whether any element is truthy (numpy-compatible signature).

        ``out`` and ``keepdims`` are accepted only for numpy
        compatibility and must be left at their defaults.
        """
        pandas.compat.numpy.function.validate_any(
            (), {"out": out, "keepdims": keepdims}
        )
        result = pandas.core.nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
        return result

    def all(
        self,
        *,
        axis: Optional[int] = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return whether all elements are truthy (numpy-compatible signature)."""
        pandas.compat.numpy.function.validate_all(
            (), {"out": out, "keepdims": keepdims}
        )
        result = pandas.core.nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
        return result

    def min(
        self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the minimum value, boxed to this dtype's scalar type."""
        pandas.compat.numpy.function.validate_min((), kwargs)
        result = pandas.core.nanops.nanmin(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._box_func(result)

    def max(
        self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the maximum value, boxed to this dtype's scalar type."""
        pandas.compat.numpy.function.validate_max((), kwargs)
        result = pandas.core.nanops.nanmax(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._box_func(result)

    # nanmedian with these arguments requires pandas >= 1.2.
    if pandas_release >= (1, 2):

        def median(
            self,
            *,
            axis: Optional[int] = None,
            out=None,
            overwrite_input: bool = False,
            keepdims: bool = False,
            skipna: bool = True,
        ):
            """Return the median value, boxed to this dtype's scalar type."""
            pandas.compat.numpy.function.validate_median(
                (),
                {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims},
            )
            result = pandas.core.nanops.nanmedian(
                self._ndarray, axis=axis, skipna=skipna
            )
            return self._box_func(result)

0 comments on commit f104171

Please sign in to comment.