Merge pull request #11216 from jreback/datetime_with_tz

BUG: edge case when reading from postgresql with read_sql_query and datetime with tz and chunksize
pandas-dev · Oct 3, 2015 · 071cffd · 071cffd
2 parents d6c7a3a + bd26dec
commit 071cffd
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 28 deletions.
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -18,6 +18,7 @@
 from pandas.core.api import DataFrame, Series
 from pandas.core.common import isnull
 from pandas.core.base import PandasObject
+from pandas.core.dtypes import DatetimeTZDtype
 from pandas.tseries.tools import to_datetime
 from pandas.util.decorators import Appender
 
@@ -89,6 +90,10 @@ def _handle_date_column(col, format=None):
             # parse dates as timestamp
             format = 's' if format is None else format
             return to_datetime(col, errors='coerce', unit=format, utc=True)
+        elif com.is_datetime64tz_dtype(col):
+            # coerce to UTC timezone
+            # GH11216
+            return to_datetime(col,errors='coerce').astype('datetime64[ns, UTC]')
         else:
             return to_datetime(col, errors='coerce', format=format, utc=True)
 
@@ -113,6 +118,14 @@ def _parse_date_columns(data_frame, parse_dates):
             fmt = None
         data_frame[col_name] = _handle_date_column(df_col, format=fmt)
 
+
+    # we want to coerce datetime64_tz dtypes for now
+    # we could in theory do a 'nice' conversion from a FixedOffset tz
+    # GH11216
+    for col_name, df_col in data_frame.iteritems():
+        if com.is_datetime64tz_dtype(df_col):
+            data_frame[col_name] = _handle_date_column(df_col)
+
     return data_frame
 
 
@@ -366,7 +379,7 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None,
     ----------
     sql : string SQL query or SQLAlchemy Selectable (select or text object)
         to be executed.
-    con : SQLAlchemy connectable(engine/connection) or database string URI 
+    con : SQLAlchemy connectable(engine/connection) or database string URI
         or sqlite3 DBAPI2 connection
         Using SQLAlchemy makes it possible to use any DB supported by that
         library.
@@ -898,11 +911,10 @@ def _harmonize_columns(self, parse_dates=None):
             try:
                 df_col = self.frame[col_name]
                 # the type the dataframe column should have
-                col_type = self._numpy_type(sql_col.type)
+                col_type = self._get_dtype(sql_col.type)
 
-                if col_type is datetime or col_type is date:
-                    if not issubclass(df_col.dtype.type, np.datetime64):
-                        self.frame[col_name] = _handle_date_column(df_col)
+                if col_type is datetime or col_type is date or col_type is DatetimeTZDtype:
+                    self.frame[col_name] = _handle_date_column(df_col)
 
                 elif col_type is float:
                     # floats support NA, can always convert!
@@ -982,20 +994,25 @@ def _sqlalchemy_type(self, col):
 
         return Text
 
-    def _numpy_type(self, sqltype):
-        from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date
+    def _get_dtype(self, sqltype):
+        from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP
 
         if isinstance(sqltype, Float):
             return float
-        if isinstance(sqltype, Integer):
+        elif isinstance(sqltype, Integer):
             # TODO: Refine integer size.
             return np.dtype('int64')
-        if isinstance(sqltype, DateTime):
+        elif isinstance(sqltype, TIMESTAMP):
+            # we have a timezone capable type
+            if not sqltype.timezone:
+                return datetime
+            return DatetimeTZDtype
+        elif isinstance(sqltype, DateTime):
             # Caution: np.datetime64 is also a subclass of np.number.
             return datetime
-        if isinstance(sqltype, Date):
+        elif isinstance(sqltype, Date):
             return date
-        if isinstance(sqltype, Boolean):
+        elif isinstance(sqltype, Boolean):
             return bool
         return object
 

diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
@@ -26,13 +26,15 @@
 import nose
 import warnings
 import numpy as np
+import pandas as pd
 
 from datetime import datetime, date, time
 
 from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat
 from pandas import date_range, to_datetime, to_timedelta, Timestamp
 import pandas.compat as compat
 from pandas.compat import StringIO, range, lrange, string_types
+from pandas.core import common as com
 from pandas.core.datetools import format as date_format
 
 import pandas.io.sql as sql
@@ -1248,6 +1250,66 @@ def test_default_date_load(self):
         self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64),
                         "DateCol loaded with incorrect type")
 
+    def test_datetime_with_timezone(self):
+        # edge case that converts postgresql datetime with time zone types
+        # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok
+        # but should be more natural, so coerce to UTC (datetime64[ns] or datetime64[ns, UTC]) for now
+
+        def check(col):
+            # check that a column is either datetime64[ns]
+            # or datetime64[ns, UTC]
+            if com.is_datetime64_dtype(col.dtype):
+
+                # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00"
+                self.assertEqual(col[0], Timestamp('2000-01-01 08:00:00'))
+
+                # "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00"
+                self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00'))
+
+            elif com.is_datetime64tz_dtype(col.dtype):
+                self.assertTrue(str(col.dt.tz) == 'UTC')
+
+                # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00"
+                self.assertEqual(col[0], Timestamp('2000-01-01 08:00:00', tz='UTC'))
+
+                # "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00"
+                self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00', tz='UTC'))
+
+            else:
+                raise AssertionError("DateCol loaded with incorrect type -> {0}".format(col.dtype))
+
+        # GH11216
+        df = pd.read_sql_query("select * from types_test_data", self.conn)
+        if not hasattr(df,'DateColWithTz'):
+            raise nose.SkipTest("no column with datetime with time zone")
+
+        # this is parsed on Travis (linux), but not on macosx for some reason
+        # even with the same versions of psycopg2 & sqlalchemy, possibly a PostgreSQL server
+        # version difference
+        col = df.DateColWithTz
+        self.assertTrue(com.is_object_dtype(col.dtype) or com.is_datetime64_dtype(col.dtype) \
+                        or com.is_datetime64tz_dtype(col.dtype),
+                        "DateCol loaded with incorrect type -> {0}".format(col.dtype))
+
+        df = pd.read_sql_query("select * from types_test_data", self.conn, parse_dates=['DateColWithTz'])
+        if not hasattr(df,'DateColWithTz'):
+            raise nose.SkipTest("no column with datetime with time zone")
+        check(df.DateColWithTz)
+
+        df = pd.concat(list(pd.read_sql_query("select * from types_test_data",
+                                              self.conn,chunksize=1)),ignore_index=True)
+        col = df.DateColWithTz
+        self.assertTrue(com.is_datetime64tz_dtype(col.dtype),
+                        "DateCol loaded with incorrect type -> {0}".format(col.dtype))
+        self.assertTrue(str(col.dt.tz) == 'UTC')
+        expected = sql.read_sql_table("types_test_data", self.conn)
+        tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz.astype('datetime64[ns, UTC]'))
+
+        # xref #7139
+        # this might or might not be converted depending on the postgres driver
+        df = sql.read_sql_table("types_test_data", self.conn)
+        check(df.DateColWithTz)
+
     def test_date_parsing(self):
         # No Parsing
         df = sql.read_sql_table("types_test_data", self.conn)
@@ -1746,23 +1808,6 @@ def test_schema_support(self):
             res2 = pdsql.read_table('test_schema_other2')
             tm.assert_frame_equal(res1, res2)
 
-    def test_datetime_with_time_zone(self):
-
-        # Test to see if we read the date column with timezones that
-        # the timezone information is converted to utc and into a
-        # np.datetime64 (GH #7139)
-
-        df = sql.read_sql_table("types_test_data", self.conn)
-        self.assertTrue(issubclass(df.DateColWithTz.dtype.type, np.datetime64),
-                        "DateColWithTz loaded with incorrect type -> {0}".format(df.DateColWithTz.dtype))
-
-        # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00"
-        self.assertEqual(df.DateColWithTz[0], Timestamp('2000-01-01 08:00:00'))
-
-        # "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00"
-        self.assertEqual(df.DateColWithTz[1], Timestamp('2000-06-01 07:00:00'))
-
-
 class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy):
     pass