Skip to content

Commit

Permalink
CLN: Enforce read_csv(keep_date_col, parse_dates) deprecations (#58622)
Browse files Browse the repository at this point in the history
* CLN: Enforce read_csv(keep_date_col, parse_dates) deprecations

* Add whatsnew, address other tests

* Remove unnecessary reference

* inline function

* Remove os.remove

* Address html and xml tests

* Typo

* Simplify _process_date_conversion

* Remove _get_complex_date_index

* Remove concat arrays for csv

* Unexfail test

* Remove convert to unicode
  • Loading branch information
mroeschke committed May 10, 2024
1 parent 24182c2 commit a17d449
Show file tree
Hide file tree
Showing 16 changed files with 102 additions and 1,535 deletions.
10 changes: 0 additions & 10 deletions asv_bench/benchmarks/io/csv.py
Expand Up @@ -445,16 +445,6 @@ def setup(self, engine):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self, engine):
    # Benchmark read_csv's (now-removed) ability to combine pairs of raw
    # columns into parsed date columns via a nested list:
    # parse_dates=[[1, 2], [1, 3]] merges cols 1+2 and 1+3 into two dates.
    read_csv(
        self.data(self.StringIO_input),
        engine=engine,
        sep=",",
        header=None,
        # Single-character column names "0".."8" for the 9 data columns.
        names=list(string.digits[:9]),
        parse_dates=[[1, 2], [1, 3]],
    )

def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
Expand Down
25 changes: 1 addition & 24 deletions asv_bench/benchmarks/io/parsers.py
@@ -1,10 +1,5 @@
import numpy as np

try:
from pandas._libs.tslibs.parsing import (
_does_string_look_like_datetime,
concat_date_cols,
)
from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
pass
Expand All @@ -20,21 +15,3 @@ def setup(self, value):
def time_check_datetimes(self, value):
    # Timed loop: run the cheap "looks like a datetime" heuristic over
    # every object prepared in setup(); the call itself is the benchmark.
    for obj in self.objects:
        _does_string_look_like_datetime(obj)


class ConcatDateCols:
    """ASV benchmark for pandas._libs.tslibs.parsing.concat_date_cols.

    Times row-wise string concatenation of one or two object-dtype
    columns, as used by read_csv's (removed) multi-column date parsing.
    """

    # value: the element repeated to fill each column (int or str case);
    # dim: how many columns are concatenated (1 = fast path, 2 = join path).
    params = ([1234567890, "AAAA"], [1, 2])
    param_names = ["value", "dim"]

    def setup(self, value, dim):
        # Build a tuple of `dim` object arrays, each with 10,000 copies
        # of `value`; stored on self so only the concat call is timed.
        count_elem = 10000
        if dim == 1:
            self.object = (np.array([value] * count_elem),)
        if dim == 2:
            self.object = (
                np.array([value] * count_elem),
                np.array([value] * count_elem),
            )

    def time_check_concat(self, value, dim):
        # Timed: concatenate same-row elements across columns into strings.
        concat_date_cols(self.object)
81 changes: 1 addition & 80 deletions doc/source/user_guide/io.rst
Expand Up @@ -262,15 +262,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
* If ``True`` -> try parsing the index.
* If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date
column.
* If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
column.
* If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.

.. note::
A fast-path exists for iso8601-formatted dates.
keep_date_col : boolean, default ``False``
If ``True`` and parse_dates specifies combining multiple columns then keep the
original columns.
date_format : str or dict of column -> format, default ``None``
If used in conjunction with ``parse_dates``, will parse dates according to this
format. For anything more complex,
Expand Down Expand Up @@ -802,71 +796,8 @@ The simplest case is to just pass in ``parse_dates=True``:
It is often the case that we may want to store date and time data separately,
or store various date fields separately. The ``parse_dates`` keyword can be
used to specify a combination of columns to parse the dates and/or times from.

You can specify a list of column lists to ``parse_dates``, the resulting date
columns will be prepended to the output (so as to not affect the existing column
order) and the new column names will be the concatenation of the component
column names:

.. ipython:: python
:okwarning:
data = (
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
"KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
"KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
"KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
"KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
"KORD,19990127, 23:00:00, 22:56:00, -0.5900"
)
with open("tmp.csv", "w") as fh:
fh.write(data)
df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])
df
By default the parser removes the component date columns, but you can choose
to retain them via the ``keep_date_col`` keyword:

.. ipython:: python
:okwarning:
df = pd.read_csv(
"tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
)
df
used to specify columns to parse the dates and/or times.

Note that if you wish to combine multiple columns into a single date column, a
nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that
the second and third columns should each be parsed as separate date columns
while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a
single column.

You can also use a dict to specify custom name columns:

.. ipython:: python
:okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
df
It is important to remember that if multiple text columns are to be parsed into
a single date column, then a new column is prepended to the data. The ``index_col``
specification is based off of this new set of columns rather than the original
data columns:


.. ipython:: python
:okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv(
"tmp.csv", header=None, parse_dates=date_spec, index_col=0
) # index is the nominal column
df

.. note::
If a column or index contains an unparsable date, the entire column or
Expand All @@ -880,10 +811,6 @@ data columns:
for your data to store datetimes in this format, load times will be
significantly faster, ~20x has been observed.

.. deprecated:: 2.2.0
Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime``
on the relevant result columns instead.


Date parsing functions
++++++++++++++++++++++
Expand All @@ -899,12 +826,6 @@ Performance-wise, you should try these methods of parsing dates in order:
then use ``to_datetime``.


.. ipython:: python
:suppress:
os.remove("tmp.csv")
.. _io.csv.mixed_timezones:

Parsing a CSV with mixed timezones
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Expand Up @@ -260,8 +260,10 @@ Removal of prior version deprecations/changes
- Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
- Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`)
- Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`)
- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`)
- Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`)
- Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` in ``parse_dates`` (:issue:`55569`)
- Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
- Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`)
- Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)
Expand Down
3 changes: 0 additions & 3 deletions pandas/_libs/tslibs/parsing.pyi
Expand Up @@ -27,7 +27,4 @@ def guess_datetime_format(
dt_str: str,
dayfirst: bool | None = ...,
) -> str | None: ...
def concat_date_cols(
date_cols: tuple,
) -> npt.NDArray[np.object_]: ...
def get_rule_month(source: str) -> str: ...
123 changes: 1 addition & 122 deletions pandas/_libs/tslibs/parsing.pyx
Expand Up @@ -7,7 +7,6 @@ import warnings

from pandas.util._exceptions import find_stack_level

cimport cython
from cpython.datetime cimport (
datetime,
datetime_new,
Expand All @@ -18,7 +17,6 @@ from cpython.datetime cimport (

from datetime import timezone

from cpython.object cimport PyObject_Str
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cython cimport Py_ssize_t
from libc.string cimport strchr
Expand All @@ -28,15 +26,7 @@ import_datetime()
import numpy as np

cimport numpy as cnp
from numpy cimport (
PyArray_GETITEM,
PyArray_ITER_DATA,
PyArray_ITER_NEXT,
PyArray_IterNew,
flatiter,
float64_t,
int64_t,
)
from numpy cimport int64_t

cnp.import_array()

Expand Down Expand Up @@ -75,8 +65,6 @@ import_pandas_datetime()

from pandas._libs.tslibs.strptime import array_strptime

from pandas._libs.tslibs.util cimport is_array


cdef extern from "pandas/portable.h":
int getdigit_ascii(char c, int default) nogil
Expand Down Expand Up @@ -1097,115 +1085,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept:
)


@cython.wraparound(False)
@cython.boundscheck(False)
cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
    """
    Convert `item` to str.

    Parameters
    ----------
    item : object
    keep_trivial_numbers : bool
        if True, then conversion (to string from integer/float zero)
        is not performed

    Returns
    -------
    str or int or float
    """
    cdef:
        float64_t float_item

    if keep_trivial_numbers:
        # "Trivial" numbers (int 0, float 0.0, NaN) are returned unchanged
        # so callers can recognize them as placeholder/missing values.
        if isinstance(item, int):
            if <int>item == 0:
                return item
        elif isinstance(item, float):
            float_item = item
            # `float_item != float_item` is the standard NaN check.
            if float_item == 0.0 or float_item != float_item:
                return item

    if not isinstance(item, str):
        # C-level equivalent of str(item).
        item = PyObject_Str(item)

    return item


@cython.wraparound(False)
@cython.boundscheck(False)
def concat_date_cols(tuple date_cols) -> np.ndarray:
    """
    Concatenates elements from numpy arrays in `date_cols` into strings.

    Rows are joined column-wise with a single space; the result has
    ``min(len(col) for col in date_cols)`` elements.

    Parameters
    ----------
    date_cols : tuple[ndarray]

    Returns
    -------
    arr_of_rows : ndarray[object]

    Raises
    ------
    ValueError
        If any element of `date_cols` is not a numpy array.

    Examples
    --------
    >>> dates=np.array(['3/31/2019', '4/30/2019'], dtype=object)
    >>> times=np.array(['11:20', '10:45'], dtype=object)
    >>> result = concat_date_cols((dates, times))
    >>> result
    array(['3/31/2019 11:20', '4/30/2019 10:45'], dtype=object)
    """
    cdef:
        Py_ssize_t rows_count = 0, col_count = len(date_cols)
        Py_ssize_t col_idx, row_idx
        list list_to_join
        cnp.ndarray[object] iters
        object[::1] iters_view
        flatiter it
        cnp.ndarray[object] result
        object[::1] result_view

    # No columns -> empty object array.
    if col_count == 0:
        return np.zeros(0, dtype=object)

    if not all(is_array(array) for array in date_cols):
        raise ValueError("not all elements from date_cols are numpy arrays")

    # Truncate to the shortest column so every output row is complete.
    rows_count = min(len(array) for array in date_cols)
    result = np.zeros(rows_count, dtype=object)
    result_view = result

    if col_count == 1:
        # Single-column fast path: no joining needed, and trivial numbers
        # (0 / NaN) are passed through unconverted (keep_trivial_numbers=True).
        array = date_cols[0]
        it = <flatiter>PyArray_IterNew(array)
        for row_idx in range(rows_count):
            item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
            result_view[row_idx] = convert_to_unicode(item, True)
            PyArray_ITER_NEXT(it)
    else:
        # create fixed size list - more efficient memory allocation
        list_to_join = [None] * col_count
        iters = np.zeros(col_count, dtype=object)

        # create memoryview of iters ndarray, that will contain some
        # flatiter's for each array in `date_cols` - more efficient indexing
        iters_view = iters
        for col_idx, array in enumerate(date_cols):
            iters_view[col_idx] = PyArray_IterNew(array)

        # array elements that are on the same line are converted to one string
        for row_idx in range(rows_count):
            for col_idx, array in enumerate(date_cols):
                # this cast is needed, because we did not find a way
                # to efficiently store `flatiter` type objects in ndarray
                it = <flatiter>iters_view[col_idx]
                item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
                list_to_join[col_idx] = convert_to_unicode(item, False)
                PyArray_ITER_NEXT(it)
            # Space-separated join of this row's column values.
            result_view[row_idx] = " ".join(list_to_join)

    return result


cpdef str get_rule_month(str source):
"""
Return starting month of given freq, default is December.
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/parsers/arrow_parser_wrapper.py
Expand Up @@ -174,8 +174,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
self.names = list(range(num_cols - len(self.names))) + self.names
multi_index_named = False
frame.columns = self.names
# we only need the frame not the names
_, frame = self._do_date_conversions(frame.columns, frame)

frame = self._do_date_conversions(frame.columns, frame)
if self.index_col is not None:
index_to_set = self.index_col.copy()
for i, item in enumerate(self.index_col):
Expand Down

0 comments on commit a17d449

Please sign in to comment.