From 6219fb8d8d19d234e386613d3952aa6b40cc23aa Mon Sep 17 00:00:00 2001 From: zhezherun <43962586+zhezherun@users.noreply.github.com> Date: Mon, 19 Nov 2018 12:09:40 +0000 Subject: [PATCH] BUG: Fixing memory leaks in read_csv (#23072) * Move allocation of na_hashset down to avoid a leak on continue * Delete na_hashset if there is an exception * Clean up table before raising an exception Closes gh-21353. --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/_libs/parsers.pyx | 46 ++++++++++++++++++++------------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index bb02bbb36424a5..b9abf9293079fb 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1382,6 +1382,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) +- Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`) - Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`) - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 40aa03caa56eb1..f74de79542628d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1070,18 +1070,6 @@ cdef class TextReader: conv = self._get_converter(i, name) - # XXX - na_flist = set() - if self.na_filter: - na_list, na_flist = self._get_na_list(i, name) - if na_list is None: - na_filter = 0 - else: - na_filter = 1 - na_hashset = kset_from_list(na_list) - else: - na_filter = 0 - col_dtype = None if self.dtype is not None: if isinstance(self.dtype, dict): @@ -1106,13 +1094,34 @@ cdef class TextReader: self.c_encoding) continue - # Should return as the desired dtype (inferred or specified) - col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, - na_flist, col_dtype) + # Collect the list of NaN values associated with the column. + # If we aren't supposed to do that, or none are collected, + # we set `na_filter` to `0` (`1` otherwise). + na_flist = set() + + if self.na_filter: + na_list, na_flist = self._get_na_list(i, name) + if na_list is None: + na_filter = 0 + else: + na_filter = 1 + na_hashset = kset_from_list(na_list) + else: + na_filter = 0 - if na_filter: - self._free_na_set(na_hashset) + # Attempt to parse tokens and infer dtype of the column. + # Should return as the desired dtype (inferred or specified). + try: + col_res, na_count = self._convert_tokens( + i, start, end, name, na_filter, na_hashset, + na_flist, col_dtype) + finally: + # gh-21353 + # + # Cleanup the NaN hash that we generated + # to avoid memory leaks. + if na_filter: + self._free_na_set(na_hashset) if upcast_na and na_count > 0: col_res = _maybe_upcast(col_res) @@ -2059,6 +2068,7 @@ cdef kh_str_t* kset_from_list(list values) except NULL: # None creeps in sometimes, which isn't possible here if not isinstance(val, bytes): + kh_destroy_str(table) raise ValueError('Must be all encoded bytes') k = kh_put_str(table, PyBytes_AsString(val), &ret)