Skip to content

Commit

Permalink
fix: raise ValueError if date is out-of-bounds (#46)
Browse files: browse the repository at this point in the history
* fix: raise ValueError if date is out-of-bounds

* unify _datetime return type

* add relevant unit test
  • Loading branch information
tswast committed Dec 4, 2021
1 parent 42109ed commit 4253358
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 28 deletions.
50 changes: 30 additions & 20 deletions db_dtypes/__init__.py
Expand Up @@ -17,7 +17,7 @@

import datetime
import re
from typing import Union
from typing import Optional, Union

import numpy
import packaging.version
Expand Down Expand Up @@ -103,7 +103,7 @@ def _datetime(
r"(?::(?P<seconds>\d+)"
r"(?:\.(?P<fraction>\d*))?)?)?\s*$"
).match,
):
) -> Optional[numpy.datetime64]:
# Convert pyarrow values to datetime.time.
if isinstance(scalar, (pyarrow.Time32Scalar, pyarrow.Time64Scalar)):
scalar = (
Expand All @@ -115,8 +115,16 @@ def _datetime(

if scalar is None:
return None
elif isinstance(scalar, datetime.time):
return datetime.datetime.combine(_EPOCH, scalar)
if isinstance(scalar, datetime.time):
return pandas.Timestamp(
year=1970,
month=1,
day=1,
hour=scalar.hour,
minute=scalar.minute,
second=scalar.second,
microsecond=scalar.microsecond,
).to_datetime64()
elif isinstance(scalar, pandas.Timestamp):
return scalar.to_datetime64()
elif isinstance(scalar, str):
Expand All @@ -125,20 +133,20 @@ def _datetime(
if not parsed:
raise ValueError(f"Bad time string: {repr(scalar)}")

hours = parsed.group("hours")
minutes = parsed.group("minutes")
seconds = parsed.group("seconds")
hour = parsed.group("hours")
minute = parsed.group("minutes")
second = parsed.group("seconds")
fraction = parsed.group("fraction")
microseconds = int(fraction.ljust(6, "0")[:6]) if fraction else 0
return datetime.datetime(
1970,
1,
1,
int(hours),
int(minutes) if minutes else 0,
int(seconds) if seconds else 0,
microseconds,
)
nanosecond = int(fraction.ljust(9, "0")[:9]) if fraction else 0
return pandas.Timestamp(
year=1970,
month=1,
day=1,
hour=int(hour),
minute=int(minute) if minute else 0,
second=int(second) if second else 0,
nanosecond=nanosecond,
).to_datetime64()
else:
raise TypeError("Invalid value type", scalar)

Expand Down Expand Up @@ -225,23 +233,25 @@ class DateArray(core.BaseDatetimeArray):
def _datetime(
scalar,
match_fn=re.compile(r"\s*(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)\s*$").match,
):
) -> Optional[numpy.datetime64]:
# Convert pyarrow values to datetime.date.
if isinstance(scalar, (pyarrow.Date32Scalar, pyarrow.Date64Scalar)):
scalar = scalar.as_py()

if scalar is None:
return None
elif isinstance(scalar, datetime.date):
return datetime.datetime(scalar.year, scalar.month, scalar.day)
return pandas.Timestamp(
year=scalar.year, month=scalar.month, day=scalar.day
).to_datetime64()
elif isinstance(scalar, str):
match = match_fn(scalar)
if not match:
raise ValueError(f"Bad date string: {repr(scalar)}")
year = int(match.group("year"))
month = int(match.group("month"))
day = int(match.group("day"))
return datetime.datetime(year, month, day)
return pandas.Timestamp(year=year, month=month, day=day).to_datetime64()
else:
raise TypeError("Invalid value type", scalar)

Expand Down
4 changes: 1 addition & 3 deletions db_dtypes/core.py
Expand Up @@ -127,9 +127,7 @@ def take(
if allow_fill:
fill_value = self._validate_scalar(fill_value)
fill_value = (
numpy.datetime64()
if fill_value is None
else numpy.datetime64(self._datetime(fill_value))
numpy.datetime64() if fill_value is None else self._datetime(fill_value)
)
if (indices < -1).any():
raise ValueError(
Expand Down
25 changes: 20 additions & 5 deletions tests/unit/test_arrow.py
Expand Up @@ -183,13 +183,13 @@ def types_mapper(
type=pyarrow.time64("us"),
),
),
(
# Only microseconds are supported when reading data. See:
# https://github.com/googleapis/python-db-dtypes-pandas/issues/19
# Still, round-trip with pyarrow nanosecond precision scalars
# is supported.
pytest.param(
pandas.Series(
[
# Only microseconds are supported when reading data. See:
# https://github.com/googleapis/python-db-dtypes-pandas/issues/19
# Still, round-trip with pyarrow nanosecond precision scalars
# is supported.
pyarrow.scalar(0, pyarrow.time64("ns")),
pyarrow.scalar(
12 * HOUR_NANOS
Expand All @@ -216,6 +216,21 @@ def types_mapper(
],
type=pyarrow.time64("ns"),
),
id="time-nanoseconds-arrow-round-trip",
),
pytest.param(
pandas.Series(
["0:0:0", "12:30:15.123456789", "23:59:59.999999999"], dtype="dbtime",
),
pyarrow.array(
[
0,
12 * HOUR_NANOS + 30 * MINUTE_NANOS + 15 * SECOND_NANOS + 123_456_789,
23 * HOUR_NANOS + 59 * MINUTE_NANOS + 59 * SECOND_NANOS + 999_999_999,
],
type=pyarrow.time64("ns"),
),
id="time-nanoseconds-arrow-from-string",
),
]

Expand Down
5 changes: 5 additions & 0 deletions tests/unit/test_date.py
Expand Up @@ -55,6 +55,11 @@ def test_date_parsing(value, expected):
("2021-2-99", "day is out of range for month"),
("2021-99-1", "month must be in 1[.][.]12"),
("10000-1-1", "year 10000 is out of range"),
# Outside the min/max values supported by pandas.Timestamp.
("0001-01-01", "Out of bounds"),
("9999-12-31", "Out of bounds"),
("1677-09-21", "Out of bounds"),
("2262-04-12", "Out of bounds"),
],
)
def test_date_parsing_errors(value, error):
Expand Down

0 comments on commit 4253358

Please sign in to comment.