Skip to content

Commit

Permalink
CLN: Enforce read_csv(keep_date_col, parse_dates) deprecations (#58622)
Browse files Browse the repository at this point in the history
* CLN: Enforce read_csv(keep_date_col, parse_dates) deprecations

* Add whatsnew, address other tests

* Remove unnecessary reference

* inline function

* Remove os.remove

* Address html and xml tests

* Typo

* Simplify _process_date_conversion

* Remove _get_complex_date_index

* Remove concat arrays for csv

* Unexfail test

* Remove convert to unicode
  • Loading branch information
mroeschke committed May 10, 2024
1 parent 24182c2 commit a17d449
Show file tree
Hide file tree
Showing 16 changed files with 102 additions and 1,535 deletions.
10 changes: 0 additions & 10 deletions asv_bench/benchmarks/io/csv.py
Expand Up @@ -445,16 +445,6 @@ def setup(self, engine):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self, engine):
    # Benchmark read_csv's (now-removed) ability to combine pairs of raw
    # columns into parsed date columns via a nested list:
    # parse_dates=[[1, 2], [1, 3]] merges cols 1+2 and 1+3 into two dates.
    read_csv(
        self.data(self.StringIO_input),
        engine=engine,
        sep=",",
        header=None,
        # Single-character column names "0".."8" for the 9 data columns.
        names=list(string.digits[:9]),
        parse_dates=[[1, 2], [1, 3]],
    )

def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
Expand Down
25 changes: 1 addition & 24 deletions asv_bench/benchmarks/io/parsers.py
@@ -1,10 +1,5 @@
import numpy as np

try:
from pandas._libs.tslibs.parsing import (
_does_string_look_like_datetime,
concat_date_cols,
)
from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
pass
Expand All @@ -20,21 +15,3 @@ def setup(self, value):
def time_check_datetimes(self, value):
    # Timed loop: run the cheap "looks like a datetime" heuristic over
    # every object prepared in setup(); the call itself is the benchmark.
    for obj in self.objects:
        _does_string_look_like_datetime(obj)


class ConcatDateCols:
    """ASV benchmark for pandas._libs.tslibs.parsing.concat_date_cols.

    Times row-wise string concatenation of one or two object-dtype
    columns, as used by read_csv's (removed) multi-column date parsing.
    """

    # value: the element repeated to fill each column (int or str case);
    # dim: how many columns are concatenated (1 = fast path, 2 = join path).
    params = ([1234567890, "AAAA"], [1, 2])
    param_names = ["value", "dim"]

    def setup(self, value, dim):
        # Build a tuple of `dim` object arrays, each with 10,000 copies
        # of `value`; stored on self so only the concat call is timed.
        count_elem = 10000
        if dim == 1:
            self.object = (np.array([value] * count_elem),)
        if dim == 2:
            self.object = (
                np.array([value] * count_elem),
                np.array([value] * count_elem),
            )

    def time_check_concat(self, value, dim):
        # Timed: concatenate same-row elements across columns into strings.
        concat_date_cols(self.object)
81 changes: 1 addition & 80 deletions doc/source/user_guide/io.rst
Expand Up @@ -262,15 +262,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
* If ``True`` -> try parsing the index.
* If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date
column.
* If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
column.
* If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.

.. note::
A fast-path exists for iso8601-formatted dates.
keep_date_col : boolean, default ``False``
If ``True`` and parse_dates specifies combining multiple columns then keep the
original columns.
date_format : str or dict of column -> format, default ``None``
If used in conjunction with ``parse_dates``, will parse dates according to this
format. For anything more complex,
Expand Down Expand Up @@ -802,71 +796,8 @@ The simplest case is to just pass in ``parse_dates=True``:
It is often the case that we may want to store date and time data separately,
or store various date fields separately. The ``parse_dates`` keyword can be
used to specify a combination of columns to parse the dates and/or times from.

You can specify a list of column lists to ``parse_dates``, the resulting date
columns will be prepended to the output (so as to not affect the existing column
order) and the new column names will be the concatenation of the component
column names:

.. ipython:: python
:okwarning:
data = (
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
"KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
"KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
"KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
"KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
"KORD,19990127, 23:00:00, 22:56:00, -0.5900"
)
with open("tmp.csv", "w") as fh:
fh.write(data)
df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])
df
By default the parser removes the component date columns, but you can choose
to retain them via the ``keep_date_col`` keyword:

.. ipython:: python
:okwarning:
df = pd.read_csv(
"tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
)
df
used to specify columns to parse the dates and/or times.

Note that if you wish to combine multiple columns into a single date column, a
nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that
the second and third columns should each be parsed as separate date columns
while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a
single column.

You can also use a dict to specify custom name columns:

.. ipython:: python
:okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
df
It is important to remember that if multiple text columns are to be parsed into
a single date column, then a new column is prepended to the data. The ``index_col``
specification is based off of this new set of columns rather than the original
data columns:


.. ipython:: python
:okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv(
"tmp.csv", header=None, parse_dates=date_spec, index_col=0
) # index is the nominal column
df

.. note::
If a column or index contains an unparsable date, the entire column or
Expand All @@ -880,10 +811,6 @@ data columns:
for your data to store datetimes in this format, load times will be
significantly faster, ~20x has been observed.

.. deprecated:: 2.2.0
Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime``
on the relevant result columns instead.


Date parsing functions
++++++++++++++++++++++
Expand All @@ -899,12 +826,6 @@ Performance-wise, you should try these methods of parsing dates in order:
then use ``to_datetime``.


.. ipython:: python
:suppress:
os.remove("tmp.csv")
.. _io.csv.mixed_timezones:

Parsing a CSV with mixed timezones
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Expand Up @@ -260,8 +260,10 @@ Removal of prior version deprecations/changes
- Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
- Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`)
- Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`)
- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`)
- Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`)
- Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` in ``parse_dates`` (:issue:`55569`)
- Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
- Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`)
- Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)
Expand Down
3 changes: 0 additions & 3 deletions pandas/_libs/tslibs/parsing.pyi
Expand Up @@ -27,7 +27,4 @@ def guess_datetime_format(
dt_str: str,
dayfirst: bool | None = ...,
) -> str | None: ...
def concat_date_cols(
date_cols: tuple,
) -> npt.NDArray[np.object_]: ...
def get_rule_month(source: str) -> str: ...
123 changes: 1 addition & 122 deletions pandas/_libs/tslibs/parsing.pyx
Expand Up @@ -7,7 +7,6 @@ import warnings

from pandas.util._exceptions import find_stack_level

cimport cython
from cpython.datetime cimport (
datetime,
datetime_new,
Expand All @@ -18,7 +17,6 @@ from cpython.datetime cimport (

from datetime import timezone

from cpython.object cimport PyObject_Str
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cython cimport Py_ssize_t
from libc.string cimport strchr
Expand All @@ -28,15 +26,7 @@ import_datetime()
import numpy as np

cimport numpy as cnp
from numpy cimport (
PyArray_GETITEM,
PyArray_ITER_DATA,
PyArray_ITER_NEXT,
PyArray_IterNew,
flatiter,
float64_t,
int64_t,
)
from numpy cimport int64_t

cnp.import_array()

Expand Down Expand Up @@ -75,8 +65,6 @@ import_pandas_datetime()

from pandas._libs.tslibs.strptime import array_strptime

from pandas._libs.tslibs.util cimport is_array


cdef extern from "pandas/portable.h":
int getdigit_ascii(char c, int default) nogil
Expand Down Expand Up @@ -1097,115 +1085,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept:
)


@cython.wraparound(False)
@cython.boundscheck(False)
cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
    """
    Convert `item` to str.

    Parameters
    ----------
    item : object
    keep_trivial_numbers : bool
        if True, then conversion (to string from integer/float zero)
        is not performed

    Returns
    -------
    str or int or float
    """
    cdef:
        float64_t float_item

    if keep_trivial_numbers:
        # "Trivial" numbers (int 0, float 0.0, NaN) are returned unchanged
        # so callers can recognize them as placeholder/missing values.
        if isinstance(item, int):
            if <int>item == 0:
                return item
        elif isinstance(item, float):
            float_item = item
            # `float_item != float_item` is the standard NaN check.
            if float_item == 0.0 or float_item != float_item:
                return item

    if not isinstance(item, str):
        # C-level equivalent of str(item).
        item = PyObject_Str(item)

    return item


@cython.wraparound(False)
@cython.boundscheck(False)
def concat_date_cols(tuple date_cols) -> np.ndarray:
    """
    Concatenates elements from numpy arrays in `date_cols` into strings.

    Rows are joined column-wise with a single space; the result has
    ``min(len(col) for col in date_cols)`` elements.

    Parameters
    ----------
    date_cols : tuple[ndarray]

    Returns
    -------
    arr_of_rows : ndarray[object]

    Raises
    ------
    ValueError
        If any element of `date_cols` is not a numpy array.

    Examples
    --------
    >>> dates=np.array(['3/31/2019', '4/30/2019'], dtype=object)
    >>> times=np.array(['11:20', '10:45'], dtype=object)
    >>> result = concat_date_cols((dates, times))
    >>> result
    array(['3/31/2019 11:20', '4/30/2019 10:45'], dtype=object)
    """
    cdef:
        Py_ssize_t rows_count = 0, col_count = len(date_cols)
        Py_ssize_t col_idx, row_idx
        list list_to_join
        cnp.ndarray[object] iters
        object[::1] iters_view
        flatiter it
        cnp.ndarray[object] result
        object[::1] result_view

    # No columns -> empty object array.
    if col_count == 0:
        return np.zeros(0, dtype=object)

    if not all(is_array(array) for array in date_cols):
        raise ValueError("not all elements from date_cols are numpy arrays")

    # Truncate to the shortest column so every output row is complete.
    rows_count = min(len(array) for array in date_cols)
    result = np.zeros(rows_count, dtype=object)
    result_view = result

    if col_count == 1:
        # Single-column fast path: no joining needed, and trivial numbers
        # (0 / NaN) are passed through unconverted (keep_trivial_numbers=True).
        array = date_cols[0]
        it = <flatiter>PyArray_IterNew(array)
        for row_idx in range(rows_count):
            item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
            result_view[row_idx] = convert_to_unicode(item, True)
            PyArray_ITER_NEXT(it)
    else:
        # create fixed size list - more efficient memory allocation
        list_to_join = [None] * col_count
        iters = np.zeros(col_count, dtype=object)

        # create memoryview of iters ndarray, that will contain some
        # flatiter's for each array in `date_cols` - more efficient indexing
        iters_view = iters
        for col_idx, array in enumerate(date_cols):
            iters_view[col_idx] = PyArray_IterNew(array)

        # array elements that are on the same line are converted to one string
        for row_idx in range(rows_count):
            for col_idx, array in enumerate(date_cols):
                # this cast is needed, because we did not find a way
                # to efficiently store `flatiter` type objects in ndarray
                it = <flatiter>iters_view[col_idx]
                item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
                list_to_join[col_idx] = convert_to_unicode(item, False)
                PyArray_ITER_NEXT(it)
            # Space-separated join of this row's column values.
            result_view[row_idx] = " ".join(list_to_join)

    return result


cpdef str get_rule_month(str source):
"""
Return starting month of given freq, default is December.
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/parsers/arrow_parser_wrapper.py
Expand Up @@ -174,8 +174,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
self.names = list(range(num_cols - len(self.names))) + self.names
multi_index_named = False
frame.columns = self.names
# we only need the frame not the names
_, frame = self._do_date_conversions(frame.columns, frame)

frame = self._do_date_conversions(frame.columns, frame)
if self.index_col is not None:
index_to_set = self.index_col.copy()
for i, item in enumerate(self.index_col):
Expand Down

0 comments on commit a17d449

Please sign in to comment.