Skip to content

Commit

Permalink
feat: add time and date dtypes
Browse files Browse the repository at this point in the history
Migrated from BigQuery googleapis/python-bigquery#972
  • Loading branch information
tswast committed Sep 21, 2021
1 parent 07758f2 commit f104171
Show file tree
Hide file tree
Showing 13 changed files with 1,552 additions and 13 deletions.
172 changes: 171 additions & 1 deletion db_dtypes/__init__.py
Expand Up @@ -15,8 +15,178 @@
Pandas Data Types for SQL systems (BigQuery, Spanner)
"""

from .version import __version__
import datetime

import numpy
import pandas
import pandas.compat.numpy.function
import pandas.core.algorithms
import pandas.core.arrays
import pandas.core.dtypes.base
import pandas.core.dtypes.dtypes
import pandas.core.dtypes.generic
import pandas.core.nanops
import pyarrow

from db_dtypes.version import __version__
from db_dtypes import core


date_dtype_name = "date"
time_dtype_name = "time"


@pandas.core.dtypes.dtypes.register_extension_dtype
class TimeDtype(core.BaseDatetimeDtype):
    """Extension dtype whose scalar values are ``datetime.time`` objects."""

    name = time_dtype_name
    type = datetime.time

    def construct_array_type(self):
        """Return the array class used to hold values of this dtype."""
        return TimeArray


class TimeArray(core.BaseDatetimeArray):
    """
    Pandas array type containing time data.

    Values are stored as ``datetime64`` anchored to the date
    Jan 1, 1970; only the time-of-day component is meaningful.
    """

    # Data are stored as datetime64 values with a date of Jan 1, 1970

    dtype = TimeDtype()
    _epoch = datetime.datetime(1970, 1, 1)
    _npepoch = numpy.datetime64(_epoch)

    @classmethod
    def _datetime(cls, scalar):
        """Convert a ``datetime.time`` or ISO "HH:MM:SS[.ffffff]" string
        into a ``datetime.datetime`` anchored at the epoch date.

        Raises:
            TypeError: for unsupported scalar types.
            ValueError: for malformed time strings.
        """
        if isinstance(scalar, datetime.time):
            return datetime.datetime.combine(cls._epoch, scalar)
        elif isinstance(scalar, str):
            # Parse the fractional seconds as a zero-padded digit string
            # instead of via float arithmetic: int(float(".876543") * 1e6)
            # truncates to 876542 because of binary rounding.
            hours, minutes, seconds = scalar.split(":")
            if "." in seconds:
                seconds, fraction = seconds.split(".")
                # ".5" means 500000 microseconds; digits beyond
                # microsecond precision are ignored.
                microseconds = int(fraction[:6].ljust(6, "0"))
            else:
                microseconds = 0
            return datetime.datetime(
                1970, 1, 1, int(hours), int(minutes), int(seconds), microseconds
            )
        else:
            raise TypeError("Invalid value type", scalar)

    def _box_func(self, x):
        """Convert a stored datetime64 value back to ``datetime.time``
        (``None`` for NaT)."""
        if pandas.isnull(x):
            return None

        try:
            return x.astype("<M8[us]").astype(datetime.datetime).time()
        except AttributeError:
            # Plain Python scalar (no .astype) -- normalize to datetime64.
            x = numpy.datetime64(x)
            return x.astype("<M8[us]").astype(datetime.datetime).time()

    # dtype strings for which astype() returns raw timedeltas since midnight.
    __return_deltas = {"timedelta", "timedelta64", "timedelta64[ns]", "<m8", "<m8[ns]"}

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; timedelta targets yield offsets from midnight."""
        deltas = self._ndarray - self._npepoch
        stype = str(dtype)
        if stype in self.__return_deltas:
            return deltas
        elif stype.startswith("timedelta64[") or stype.startswith("<m8["):
            return deltas.astype(dtype, copy=False)
        else:
            return super().astype(dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """Convert to a pyarrow array (default element type time64[ns])."""
        return pyarrow.array(
            self.to_numpy(), type=type if type is not None else pyarrow.time64("ns"),
        )


@pandas.core.dtypes.dtypes.register_extension_dtype
class DateDtype(core.BaseDatetimeDtype):
    """
    Extension dtype for date data.
    """

    name = date_dtype_name
    type = datetime.date

    def construct_array_type(self):
        # Array class used to hold values of this dtype.
        return DateArray


class DateArray(core.BaseDatetimeArray):
    """
    Pandas array type containing date data.

    Values are stored as ``datetime64`` at midnight; only the date
    component is meaningful.
    """

    # Data are stored as datetime64 values with a date of Jan 1, 1970

    dtype = DateDtype()

    @staticmethod
    def _datetime(scalar):
        """Convert a ``datetime.date`` or ISO "YYYY-MM-DD" string into a
        ``datetime.datetime`` at midnight.

        Raises:
            TypeError: for unsupported scalar types.
            ValueError: for malformed date strings.
        """
        if isinstance(scalar, datetime.date):
            return datetime.datetime(scalar.year, scalar.month, scalar.day)
        elif isinstance(scalar, str):
            # Unpack exactly three components. The previous
            # ``datetime.datetime(*map(int, scalar.split("-")))`` silently
            # accepted extra components (e.g. "2021-2-3-4" set hour=4);
            # now such strings raise ValueError.
            year, month, day = scalar.split("-")
            return datetime.datetime(int(year), int(month), int(day))
        else:
            raise TypeError("Invalid value type", scalar)

    def _box_func(self, x):
        """Convert a stored datetime64 value back to ``datetime.date``
        (``None`` for NaT)."""
        if pandas.isnull(x):
            return None
        try:
            return x.astype("<M8[us]").astype(datetime.datetime).date()
        except AttributeError:
            # Plain Python scalar (no .astype) -- normalize to datetime64.
            x = numpy.datetime64(x)
            return x.astype("<M8[us]").astype(datetime.datetime).date()

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; datetime64 targets expose the backing array."""
        stype = str(dtype)
        if stype.startswith("datetime"):
            if stype == "datetime" or stype == "datetime64":
                # Bare "datetime64" keeps our native (nanosecond) unit.
                dtype = self._ndarray.dtype
            return self._ndarray.astype(dtype, copy=copy)
        elif stype.startswith("<M8"):
            if stype == "<M8":
                dtype = self._ndarray.dtype
            return self._ndarray.astype(dtype, copy=copy)

        return super().astype(dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """Convert to a pyarrow array (default element type date32)."""
        return pyarrow.array(
            self._ndarray, type=type if type is not None else pyarrow.date32(),
        )

    def __add__(self, other):
        # date + offset -> object-dtype result; date + time -> datetime64.
        if isinstance(other, pandas.DateOffset):
            return self.astype("object") + other

        if isinstance(other, TimeArray):
            return (other._ndarray - other._npepoch) + self._ndarray

        return super().__add__(other)

    def __radd__(self, other):
        return self.__add__(other)

    def __sub__(self, other):
        # date - offset -> object-dtype result; date - date -> timedelta64.
        if isinstance(other, pandas.DateOffset):
            return self.astype("object") - other

        if isinstance(other, self.__class__):
            return self._ndarray - other._ndarray

        return super().__sub__(other)


# Public names exported by the db_dtypes package.
__all__ = [
    "__version__",
    "DateArray",
    "DateDtype",
    "TimeArray",
    "TimeDtype",
]
210 changes: 210 additions & 0 deletions db_dtypes/core.py
@@ -0,0 +1,210 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Optional, Sequence

import numpy
import pandas
from pandas._libs import NaT
from pandas._typing import Scalar
import pandas.compat.numpy.function
import pandas.core.algorithms
import pandas.core.arrays
import pandas.core.dtypes.base
from pandas.core.dtypes.common import is_dtype_equal, is_list_like, pandas_dtype
import pandas.core.dtypes.dtypes
import pandas.core.dtypes.generic
import pandas.core.nanops

from db_dtypes import pandas_backports


pandas_release = pandas_backports.pandas_release


class BaseDatetimeDtype(pandas.core.dtypes.base.ExtensionDtype):
    """Base class for the date and time extension dtypes.

    Subclasses set ``name`` and ``type``. Values are stored as
    datetime64, so the missing-value sentinel is NaT.
    """

    na_value = NaT
    kind = "o"
    names = None

    @classmethod
    def construct_from_string(cls, name):
        """Construct this dtype from its string name.

        Raises:
            TypeError: if ``name`` is not a string or does not match
                this dtype's ``name`` -- required by the pandas
                extension-dtype contract, with a descriptive message
                instead of the previous bare ``TypeError()``.
        """
        if not isinstance(name, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(name)}"
            )
        if name != cls.name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{name}'")

        return cls()


class BaseDatetimeArray(
    pandas_backports.OpsMixin, pandas_backports.NDArrayBackedExtensionArray
):
    """Base class for the date and time extension arrays.

    Values are backed by a numpy ``datetime64[ns]`` ndarray. Subclasses
    provide ``_datetime`` (scalar -> ``datetime.datetime``) to convert
    user-facing values into storage, and ``_box_func`` (stored value ->
    scalar) to convert back.
    """

    def __init__(self, values, dtype=None, copy: bool = False):
        # Anything that is not already a datetime64[ns] ndarray is
        # converted element-wise through the subclass's _datetime hook.
        if not (
            isinstance(values, numpy.ndarray) and values.dtype == numpy.dtype("<M8[ns]")
        ):
            values = self.__ndarray(values)
        elif copy:
            values = values.copy()

        super().__init__(values=values, dtype=values.dtype)

    @classmethod
    def __ndarray(cls, scalars):
        # Build the backing datetime64[ns] array; None entries become NaT.
        return numpy.array(
            [None if scalar is None else cls._datetime(scalar) for scalar in scalars],
            "M8[ns]",
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Construct a new array from a sequence of scalars.

        NOTE(review): ``copy`` is accepted for pandas API compatibility
        but ignored -- __ndarray always allocates a fresh array. The
        dtype check uses ``assert``, so it is skipped under ``-O``.
        """
        if dtype is not None:
            assert dtype.__class__ is cls.dtype.__class__
        return cls(cls.__ndarray(scalars))

    # Strings go through the same scalar conversion as other values.
    _from_sequence_of_strings = _from_sequence

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; a no-op (modulo ``copy``) for our own dtype."""
        dtype = pandas_dtype(dtype)
        if is_dtype_equal(dtype, self.dtype):
            if not copy:
                return self
            else:
                return self.copy()

        return super().astype(dtype, copy=copy)

    def _cmp_method(self, other, op):
        # Element-wise comparison; only same-typed arrays are comparable.
        if type(other) != type(self):
            return NotImplemented
        return op(self._ndarray, other._ndarray)

    def __setitem__(self, key, value):
        # Normalize assigned values through _datetime so the backing
        # array stays datetime64; NA scalars pass through untouched.
        if is_list_like(value):
            _datetime = self._datetime
            value = [_datetime(v) for v in value]
        elif not pandas.isna(value):
            value = self._datetime(value)
        return super().__setitem__(key, value)

    def _from_factorized(self, unique, original):
        # Re-wrap the unique values produced by pandas.factorize.
        return self.__class__(unique)

    def isna(self):
        """Return a boolean ndarray marking NaT positions."""
        return pandas.isna(self._ndarray)

    def _validate_scalar(self, value):
        """Validate a fill value for ``take``; NA values map to None.

        Raises:
            ValueError: if ``value`` is not an instance of the dtype's
                scalar type.
        """
        if pandas.isna(value):
            return None

        if not isinstance(value, self.dtype.type):
            raise ValueError(value)

        return value

    def take(
        self,
        indices: Sequence[int],
        *,
        allow_fill: bool = False,
        fill_value: Any = None,
    ):
        """Take elements by position (pandas ``ExtensionArray.take``).

        With ``allow_fill=True``, ``-1`` indices are replaced by
        ``fill_value`` (NaT when no fill value is given); other negative
        indices raise ValueError.
        """
        indices = numpy.asarray(indices, dtype=numpy.intp)
        data = self._ndarray
        if allow_fill:
            fill_value = self._validate_scalar(fill_value)
            fill_value = (
                numpy.datetime64()  # bare datetime64 constructor yields NaT
                if fill_value is None
                else numpy.datetime64(self._datetime(fill_value))
            )
            if (indices < -1).any():
                raise ValueError(
                    "take called with negative indexes other than -1,"
                    " when a fill value is provided."
                )
        out = data.take(indices)
        if allow_fill:
            out[indices == -1] = fill_value

        return self.__class__(out)

    # TODO: provide implementations of dropna, fillna, unique,
    # factorize, argsort, searchsorted for better performance over
    # abstract implementations.

    def any(
        self,
        *,
        axis: Optional[int] = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return whether any element is truthy (numpy-compatible signature).

        ``out`` and ``keepdims`` are accepted only for numpy
        compatibility and must be left at their defaults.
        """
        pandas.compat.numpy.function.validate_any(
            (), {"out": out, "keepdims": keepdims}
        )
        result = pandas.core.nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
        return result

    def all(
        self,
        *,
        axis: Optional[int] = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return whether all elements are truthy (numpy-compatible signature)."""
        pandas.compat.numpy.function.validate_all(
            (), {"out": out, "keepdims": keepdims}
        )
        result = pandas.core.nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
        return result

    def min(
        self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the minimum value, boxed to this dtype's scalar type."""
        pandas.compat.numpy.function.validate_min((), kwargs)
        result = pandas.core.nanops.nanmin(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._box_func(result)

    def max(
        self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the maximum value, boxed to this dtype's scalar type."""
        pandas.compat.numpy.function.validate_max((), kwargs)
        result = pandas.core.nanops.nanmax(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._box_func(result)

    # nanmedian with these arguments requires pandas >= 1.2.
    if pandas_release >= (1, 2):

        def median(
            self,
            *,
            axis: Optional[int] = None,
            out=None,
            overwrite_input: bool = False,
            keepdims: bool = False,
            skipna: bool = True,
        ):
            """Return the median value, boxed to this dtype's scalar type."""
            pandas.compat.numpy.function.validate_median(
                (),
                {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims},
            )
            result = pandas.core.nanops.nanmedian(
                self._ndarray, axis=axis, skipna=skipna
            )
            return self._box_func(result)

0 comments on commit f104171

Please sign in to comment.