From 419e6de2d7f4a5cf5a2cdd9e134b0f0aad56cd84 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 27 Aug 2021 14:55:52 +0100 Subject: [PATCH 01/55] Set all daskified decorator log levels to default, to adapt as req. --- cf/data/data.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 020a09b964..70c7ba95cb 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -642,7 +642,7 @@ def dask_compressed_array(self): return ca._get_dask().copy() - @daskified(1) + @daskified() def __contains__(self, value): """Membership test operator ``in`` @@ -934,7 +934,7 @@ def __repr__(self): """ return super().__repr__().replace("<", " Date: Fri, 27 Aug 2021 17:28:59 +0100 Subject: [PATCH 02/55] Add basic (excluding parameters) unit test for Data.equals --- cf/test/test_Data.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 08de4a7750..30110ff410 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -131,6 +131,41 @@ class DataTest(unittest.TestCase): # test_only = ['test_Data_clip'] # test_only = ['test_Data__init__dtype_mask'] + def test_Data_equals(self): + if self.test_only and inspect.stack()[0][3] not in self.test_only: + return + + shape = 3, 4 + a = np.arange(12).reshape(*shape) + + d = cf.Data(a, "m") + self.assertTrue(d.equals(d)) # trivial check + + d2 = cf.Data(a.astype(np.float32), "m") + self.assertFalse(d2.equals(d)) # due to different datatype + + e = cf.Data(a, "s") + self.assertFalse(e.equals(d)) # due to different units + + f = cf.Data(np.arange(12)) + self.assertFalse(f.equals(d)) # due to different shape + + g = cf.Data(np.ones(shape)) + self.assertFalse(g.equals(d)) # due to different element value(s) + + # Test NaN and inf values + h = cf.Data(np.full(shape, np.nan)) + self.assertFalse(h.equals(d)) + i = cf.Data(np.full(shape, np.inf)) + self.assertFalse(i.equals(d)) + self.assertFalse(h.equals(i)) + + # Test masked arrays + j = cf.Data(np.ma.masked_all(shape)) + self.assertFalse(j.equals(d)) + + # TODODASK Test equals method parameters + @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'ndim'") def test_Data_halo(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: From caad9cbcd9402d533ebf04ddf49a6748007dbf60 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 27 Aug 2021 17:49:08 +0100 Subject: [PATCH 03/55] Migrate Data.equals from LAMA to Dask --- cf/data/data.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 70c7ba95cb..e90feefef0 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -32,7 +32,7 @@ _inplace_enabled_define_and_cleanup, _manage_log_level_via_verbosity, ) -from ..functions import _numpy_allclose, _numpy_isclose, _section, abspath +from ..functions import _numpy_isclose, _section, abspath from ..functions import atol as cf_atol from ..functions import broadcast_array from ..functions import chunksize as cf_chunksize @@ -9034,6 +9034,7 @@ def ndindex(self): """ return product(*[range(0, r) for r in self.shape]) + @daskified(2) @_deprecated_kwarg_check("traceback") @_manage_log_level_via_verbosity def equals( @@ -9116,26 +9117,21 @@ def equals( ) return False - config = self.partition_configuration(readonly=True) + # other.to_memory() # TODODASK is this still required? - other.to_memory() + self_dx = self._get_dask() + other_dx = other._get_dask() - for partition in self.partitions.matrix.flat: - partition.open(config) - array0 = partition.array - array1 = other[partition.indices].varray - partition.close() - - if not _numpy_allclose( - array0, array1, rtol=float(rtol), atol=float(atol) - ): - logger.info( - "{0}: Different array values (atol={1}, " - "rtol={2})".format(self.__class__.__name__, atol, rtol) - ) + # Finally check that corresponding elements are equal to a tolerance + if not da.allclose( + self_dx, other_dx, rtol=float(rtol), atol=float(atol) + ): + logger.info( + "{0}: Different array values (atol={1}, " + "rtol={2})".format(self.__class__.__name__, atol, rtol) + ) - return False - # --- End: for + return False # ------------------------------------------------------------ # Still here? Then the two instances are equal. From 90af93b49512ee518cc90be47096b9f92f31572e Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 31 Aug 2021 21:00:31 +0100 Subject: [PATCH 04/55] Address several DeprecationWarning warnings raised by pytest Also fix some issues in the functions module so flake8 passes --- cf/cellmethod.py | 6 +++--- cf/field.py | 2 +- cf/functions.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cf/cellmethod.py b/cf/cellmethod.py index d5899c557e..37e2ac67af 100644 --- a/cf/cellmethod.py +++ b/cf/cellmethod.py @@ -104,8 +104,8 @@ def create(cls, cell_methods_string=None): # # ['lat:', 'mean', '(', 'interval:', '1', 'hour', ')'] # ------------------------------------------------------------ - cell_methods = re.sub("\((?=[^\s])", "( ", cell_methods_string) - cell_methods = re.sub("(?<=[^\s])\)", " )", cell_methods).split() + cell_methods = re.sub(r"\((?=[^\s])", "( ", cell_methods_string) + cell_methods = re.sub(r"(?<=[^\s])\)", " )", cell_methods).split() while cell_methods: cm = cls() @@ -156,7 +156,7 @@ def create(cls, cell_methods_string=None): if not (re.search("^(interval|comment):$", cell_methods[0])): cell_methods.insert(0, "comment:") - while not re.search("^\)$", cell_methods[0]): + while not re.search(r"^\)$", cell_methods[0]): term = cell_methods.pop(0)[:-1] if term == "interval": diff --git a/cf/field.py b/cf/field.py index cf7e46ac80..a4160d3c5a 100644 --- a/cf/field.py +++ b/cf/field.py @@ -3706,7 +3706,7 @@ def _weights_geometry_volume( return True def _weights_interior_angle(self, data_lambda, data_phi): - """Find the interior angle between each adjacent pair of + r"""Find the interior angle between each adjacent pair of geometry nodes defined on a sphere. The interior angle of two points on the sphere is calculated with diff --git a/cf/functions.py b/cf/functions.py index f45ca37429..ba22e4d911 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -33,6 +33,7 @@ from numpy import ascontiguousarray as _numpy_ascontiguousarray from numpy import isclose as _x_numpy_isclose from numpy import shape as _numpy_shape +from numpy import size as _numpy_size from numpy import take as _numpy_take from numpy import tile as _numpy_tile from numpy.ma import all as _numpy_ma_all @@ -799,7 +800,7 @@ def _parse(cls, arg): class chunksize(ConstantAccess): - """Set the chunksize used by LAMA for partitioning the data array. + r"""Set the chunksize used by LAMA for partitioning the data array. This must be smaller than an upper limit determined by the free memory factor, which is the fraction of memory kept free as a @@ -2252,7 +2253,7 @@ def load_stash2standard_name(table=None, delimiter="!", merge=True): # 8 PP extra info # Number matching regular expression - number_regex = "([-+]?\d*\.?\d+(e[-+]?\d+)?)" + number_regex = r"([-+]?\d*\.?\d+(e[-+]?\d+)?)" if table is None: # Use default conversion table From 6c1fff5e156833d52db0ac5768f90d7765b7ea64 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 31 Aug 2021 21:57:29 +0100 Subject: [PATCH 05/55] Improve test by adding assertions on raised log messages --- cf/test/test_Data.py | 93 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 30110ff410..7a1c05ce44 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -141,30 +141,89 @@ def test_Data_equals(self): d = cf.Data(a, "m") self.assertTrue(d.equals(d)) # trivial check - d2 = cf.Data(a.astype(np.float32), "m") - self.assertFalse(d2.equals(d)) # due to different datatype + d2 = cf.Data(a.astype(np.float32), "m") # different datatype to d + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(d2.equals(d)) + self.assertTrue( + any( + "Data: Different data types: float32 != int64" in log_msg + for log_msg in catch.output + ) + ) - e = cf.Data(a, "s") - self.assertFalse(e.equals(d)) # due to different units + e = cf.Data(a, "s") # different units to d + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(e.equals(d)) + self.assertTrue( + any( + "Data: Different Units (, " in log_msg + for log_msg in catch.output + ) + ) - f = cf.Data(np.arange(12)) - self.assertFalse(f.equals(d)) # due to different shape + f = cf.Data(np.arange(12), "m") # different shape to d + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(f.equals(d)) + self.assertTrue( + any( + "Data: Different shapes: (12,) != (3, 4)" in log_msg + for log_msg in catch.output + ) + ) - g = cf.Data(np.ones(shape)) - self.assertFalse(g.equals(d)) # due to different element value(s) + g = cf.Data(np.ones(shape, dtype="int64"), "m") # different values + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(g.equals(d)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) # Test NaN and inf values - h = cf.Data(np.full(shape, np.nan)) - self.assertFalse(h.equals(d)) - i = cf.Data(np.full(shape, np.inf)) - self.assertFalse(i.equals(d)) - self.assertFalse(h.equals(i)) + h = cf.Data(np.full(shape, np.nan, dtype="int"), "m") + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(h.equals(d)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) + i = cf.Data(np.full(shape, np.inf, dtype="int"), "m") + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(i.equals(d)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) + # TODODASK: the below should eventually pass, currently doesn't since + # the two underlying arrays each are filled with the fill_value. + # + # with self.assertLogs(level=cf.log_level().value) as catch: + # print(h) + # print(i) + # self.assertFalse(h.equals(i)) + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) # Test masked arrays - j = cf.Data(np.ma.masked_all(shape)) - self.assertFalse(j.equals(d)) - - # TODODASK Test equals method parameters + j = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(j.equals(d)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'ndim'") def test_Data_halo(self): From 0e916c06f29278f864737c3546cb978e0cf2d987 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 31 Aug 2021 22:56:44 +0100 Subject: [PATCH 06/55] Fix equality test w/ numpy inf & NaN by comparing to array of same dtype --- cf/test/test_Data.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 7a1c05ce44..b6af79c5ba 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -182,42 +182,29 @@ def test_Data_equals(self): ) # Test NaN and inf values - h = cf.Data(np.full(shape, np.nan, dtype="int"), "m") + h = cf.Data(np.full(shape, np.nan), "m") + d3 = cf.Data(a.astype(np.float64), "m") with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(h.equals(d)) + # Compare to d3 not d since np.nan has dtype float64 (IEEE 754) + self.assertFalse(h.equals(d3)) + print(catch.output) self.assertTrue( any( "Data: Different array values" in log_msg for log_msg in catch.output ) ) - i = cf.Data(np.full(shape, np.inf, dtype="int"), "m") + i = cf.Data(np.full(shape, np.inf), "m") with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(i.equals(d)) + self.assertFalse(i.equals(d3)) # np.inf is also of dtype float64 self.assertTrue( any( "Data: Different array values" in log_msg for log_msg in catch.output ) ) - # TODODASK: the below should eventually pass, currently doesn't since - # the two underlying arrays each are filled with the fill_value. - # - # with self.assertLogs(level=cf.log_level().value) as catch: - # print(h) - # print(i) - # self.assertFalse(h.equals(i)) - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) - - # Test masked arrays - j = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(j.equals(d)) + self.assertFalse(h.equals(i)) self.assertTrue( any( "Data: Different array values" in log_msg From ea54b2c7fa4881db6b3f2597ac3aac20bec5d99e Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 31 Aug 2021 23:25:24 +0100 Subject: [PATCH 07/55] Complete Data.equals unit test by covering all valid parameters --- cf/test/test_Data.py | 62 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index b6af79c5ba..b47c67437b 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -212,6 +212,68 @@ def test_Data_equals(self): ) ) + # Test masked arrays + j = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(j.equals(d)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) + + # Test rtol and atol parameters + k1 = cf.Data(np.array([10.0, 20.0])) + k2 = cf.Data(np.array([10.01, 20.01])) + for ks in [(k1, k2), (k2, k1)]: # to test symmetry/commutativity + k_1, k_2 = ks + # Only one log check is sufficient here + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(k_1.equals(k_2, atol=0.005, rtol=0)) + self.assertTrue( + any( + "Data: Different array values (atol=0.005, rtol=0)" + in log_msg + for log_msg in catch.output + ) + ) + self.assertTrue(k_1.equals(k_2, atol=0.02, rtol=0)) + self.assertFalse(k_1.equals(k_2, atol=0, rtol=0.0005)) + self.assertTrue(k_1.equals(k_2, atol=0, rtol=0.002)) + + # Test ignore_fill_value parameter + m1 = cf.Data(1, fill_value=1000) + m2 = cf.Data(1, fill_value=2000) + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(m1.equals(m2)) + self.assertTrue( + any( + "Data: Different fill value: 1000 != 2000" in log_msg + for log_msg in catch.output + ) + ) + self.assertTrue(m1.equals(m2, ignore_fill_value=True)) + + # Test verbose parameter: 1/'INFO' level is behaviour change boundary + for checks in [(1, False), (2, True)]: + verbosity_level, expect_to_see_msg = checks + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(d2.equals(d, verbose=verbosity_level)) + self.assertIs( + any( + "Data: Different data types: float32 != int64" + in log_msg + for log_msg in catch.output + ), + expect_to_see_msg, + ) + + # Test ignore_data_type parameter + # TODODASK - this one needs documenting in the method docstring. + # Question for DH: is ignore_type an alias? + self.assertTrue(d2.equals(d, ignore_data_type=True)) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'ndim'") def test_Data_halo(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: From bf3d58ab0d79b574c81d71cdcf708c55c72b371a Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 31 Aug 2021 23:28:06 +0100 Subject: [PATCH 08/55] Add missing closibng parenthesis in log message for differing Data units --- cf/data/data.py | 2 +- cf/test/test_Data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index e90feefef0..a6e5e4a06d 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9111,7 +9111,7 @@ def equals( other_Units = other.Units if self_Units != other_Units: logger.info( - "{}: Different Units ({!r}, {!r}".format( + "{}: Different Units ({!r}, {!r})".format( self.__class__.__name__, self.Units, other.Units ) ) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index b47c67437b..1e11cb082a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -156,7 +156,7 @@ def test_Data_equals(self): self.assertFalse(e.equals(d)) self.assertTrue( any( - "Data: Different Units (, " in log_msg + "Data: Different Units (, )" in log_msg for log_msg in catch.output ) ) From d85b5cfd7a4ea86534e90cb834f40e23fc0ff1da Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 2 Sep 2021 19:41:29 +0100 Subject: [PATCH 09/55] Conversions from str.format() to f-strings --- cf/data/data.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index a6e5e4a06d..449e21277d 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9111,9 +9111,8 @@ def equals( other_Units = other.Units if self_Units != other_Units: logger.info( - "{}: Different Units ({!r}, {!r})".format( - self.__class__.__name__, self.Units, other.Units - ) + f"{self.__class__.__name__}: Different Units " + f"({self.Units!r}, {other.Units!r})" ) return False @@ -9127,8 +9126,8 @@ def equals( self_dx, other_dx, rtol=float(rtol), atol=float(atol) ): logger.info( - "{0}: Different array values (atol={1}, " - "rtol={2})".format(self.__class__.__name__, atol, rtol) + f"{self.__class__.__name__}: Different array values " + f"(atol={atol}, rtol={rtol})" ) return False From ee6ed327fdb58814ef869076b36c05f0682390be Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 5 Nov 2021 17:09:50 +0000 Subject: [PATCH 10/55] New approach to equals() migration: define own da.ma.allclose --- cf/data/data.py | 40 ++++++++++++++++++++++++---------------- cf/data/utils.py | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 449e21277d..fb09dfbd48 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -107,6 +107,7 @@ from .partition import Partition from .partitionmatrix import PartitionMatrix from .utils import ( # is_small,; is_very_small, + _da_ma_allclose, conform_units, convert_to_datetime, convert_to_reftime, @@ -9116,26 +9117,33 @@ def equals( ) return False - # other.to_memory() # TODODASK is this still required? - self_dx = self._get_dask() other_dx = other._get_dask() - # Finally check that corresponding elements are equal to a tolerance - if not da.allclose( - self_dx, other_dx, rtol=float(rtol), atol=float(atol) - ): - logger.info( - f"{self.__class__.__name__}: Different array values " - f"(atol={atol}, rtol={rtol})" - ) - - return False + # Now check that corresponding elements are equal within a tolerance. + # We assume that all inputs are masked arrays. + mask_comparison = da.equal( + da.ma.getmaskarray(self_dx), da.ma.getmaskarray(other_dx) + ) + data_comparison = _da_ma_allclose( + self_dx, + other_dx, + masked_equal=False, + rtol=float(rtol), + atol=float(atol), + ) + # Apply a (dask) logical 'and' to confirm if both the mask and the + # data are equal for the pair of masked arrays: + result = da.all(da.logical_and(mask_comparison, data_comparison)) - # ------------------------------------------------------------ - # Still here? Then the two instances are equal. - # ------------------------------------------------------------ - return True + # Return the uncomputed result because there will often be further + # steps in the task graph and computing early is inefficient. + # + # Note that we don't supply an error message (for the False case) + # here because we can't specify when to run it without having + # evaluated by computing, but that's OK because dask will provide + # an appropriate error message to indicate any inequalities. + return result @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) diff --git a/cf/data/utils.py b/cf/data/utils.py index da0499c24b..05afeb0aa2 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -10,6 +10,38 @@ from ..units import Units +def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): + """An effective dask.array.ma.allclose method. + + True if two dask arrays are element-wise equal within + a tolerance. + + Equivalent to allclose except that masked values are treated + as equal (default) or unequal, depending on the masked_equal + argument. + + Define an effective da.ma.allclose method here because one is + currently missing in the Dask codebase. + + Note that all default arguments are the same as those provided to + the corresponding NumPy method (see the `numpy.ma.allclose` API + reference). + + TODODASK: put in a PR to Dask to request to add as genuine method. + + """ + x = da.asanyarray(x) + y = da.asanyarray(y) + return da.map_blocks( + np.ma.allclose, + x, + y, + masked_equal=masked_equal, + rtol=rtol, + atol=atol, + ) + + def convert_to_datetime(array, units): """Convert a daskarray to. @@ -415,7 +447,7 @@ def scalar_masked_array(dtype=float): a = np.ma.empty((), dtype=dtype) a.mask = True return a - + def conform_units(value, units): """Conform units. @@ -480,4 +512,3 @@ def conform_units(value, units): ) return value - From 6eafaf5197df5fab4457112fc8d1fd62129df78b Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 5 Nov 2021 17:52:07 +0000 Subject: [PATCH 11/55] Remove redundant tests on log messages; now test_Data_equals passes --- cf/test/test_Data.py | 113 +++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 1e11cb082a..134fe3b39a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -174,12 +174,15 @@ def test_Data_equals(self): g = cf.Data(np.ones(shape, dtype="int64"), "m") # different values with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(g.equals(d)) - self.assertTrue( - any( - "Data: Different array values" in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: (how) can we supply a post-compute log message such + # as the below, when we return an uncomputed result for the + # `equals` method? Leave these tests in until we know. + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) # Test NaN and inf values h = cf.Data(np.full(shape, np.nan), "m") @@ -187,41 +190,44 @@ def test_Data_equals(self): with self.assertLogs(level=cf.log_level().value) as catch: # Compare to d3 not d since np.nan has dtype float64 (IEEE 754) self.assertFalse(h.equals(d3)) - print(catch.output) - self.assertTrue( - any( - "Data: Different array values" in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: see TODO in previous block, don't uncomment below + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) i = cf.Data(np.full(shape, np.inf), "m") with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(i.equals(d3)) # np.inf is also of dtype float64 - self.assertTrue( - any( - "Data: Different array values" in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: see TODO in previous block, don't uncomment below + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(h.equals(i)) - self.assertTrue( - any( - "Data: Different array values" in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: see TODO in previous block, don't uncomment below + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) # Test masked arrays j = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j.equals(d)) - self.assertTrue( - any( - "Data: Different array values" in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: see TODO in previous block, don't uncomment below + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) # Test rtol and atol parameters k1 = cf.Data(np.array([10.0, 20.0])) @@ -231,13 +237,14 @@ def test_Data_equals(self): # Only one log check is sufficient here with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(k_1.equals(k_2, atol=0.005, rtol=0)) - self.assertTrue( - any( - "Data: Different array values (atol=0.005, rtol=0)" - in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: see TODO in previous block, don't uncomment below + # self.assertTrue( + # any( + # "Data: Different array values (atol=0.005, rtol=0)" + # in log_msg + # for log_msg in catch.output + # ) + # ) self.assertTrue(k_1.equals(k_2, atol=0.02, rtol=0)) self.assertFalse(k_1.equals(k_2, atol=0, rtol=0.0005)) self.assertTrue(k_1.equals(k_2, atol=0, rtol=0.002)) @@ -247,12 +254,13 @@ def test_Data_equals(self): m2 = cf.Data(1, fill_value=2000) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(m1.equals(m2)) - self.assertTrue( - any( - "Data: Different fill value: 1000 != 2000" in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: see TODO in previous block, don't uncomment below + # self.assertTrue( + # any( + # "Data: Different fill value: 1000 != 2000" in log_msg + # for log_msg in catch.output + # ) + # ) self.assertTrue(m1.equals(m2, ignore_fill_value=True)) # Test verbose parameter: 1/'INFO' level is behaviour change boundary @@ -260,14 +268,15 @@ def test_Data_equals(self): verbosity_level, expect_to_see_msg = checks with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(d2.equals(d, verbose=verbosity_level)) - self.assertIs( - any( - "Data: Different data types: float32 != int64" - in log_msg - for log_msg in catch.output - ), - expect_to_see_msg, - ) + # TODODASK: see TODO in previous block, don't uncomment below + # self.assertIs( + # any( + # "Data: Different data types: float32 != int64" + # in log_msg + # for log_msg in catch.output + # ), + # expect_to_see_msg, + # ) # Test ignore_data_type parameter # TODODASK - this one needs documenting in the method docstring. From 2f1657428743d6a5852a09de9ffd87751f830da6 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 8 Nov 2021 16:34:10 +0000 Subject: [PATCH 12/55] Flesh out docstring for new internal utility method _da_ma_allclose() --- cf/data/utils.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cf/data/utils.py b/cf/data/utils.py index 05afeb0aa2..e7bb0633c6 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -29,6 +29,32 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): TODODASK: put in a PR to Dask to request to add as genuine method. + .. versionadded:: 4.0.0 + + :Parameters: + + x: a dask array to compare with y + + y: a dask array to compare with x + + masked_equal: + Whether masked values in a and b are considered + equal (True) or not (False). They are considered equal + by default. + + rtol: + Relative tolerance. Default is 1e-05. + + atol: + Absolute tolerance. Default is 1e-08. + + :Returns: + + Boolean + A Boolean value indicating whether or not the + two dask arrays are element-wise equal to + the given rtol and atol tolerance. + """ x = da.asanyarray(x) y = da.asanyarray(y) From 22a0fee075a972b0028e5aaea67c794776031c7b Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 8 Nov 2021 16:38:03 +0000 Subject: [PATCH 13/55] Document undoc'd kwargs for Data.equals() method --- cf/data/data.py | 10 ++++++++-- cf/test/test_Data.py | 2 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index fb09dfbd48..386e47be8f 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9059,19 +9059,25 @@ def equals( other: The object to compare for equality. - {{atol: number, optional}} - {{rtol: number, optional}} + {{atol: number, optional}} + ignore_fill_value: `bool`, optional If True then data arrays with different fill values are considered equal. By default they are considered unequal. + {{ignore_data_type: `bool`, optional}} + + {{ignore_type: `bool`, optional}} + {{verbose: `int` or `str` or `None`, optional}} traceback: deprecated at version 3.0.0 Use the *verbose* parameter instead. + {{ignore_compression: `bool`, optional}} + :Returns: `bool` diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 134fe3b39a..2cb8825e83 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -279,8 +279,6 @@ def test_Data_equals(self): # ) # Test ignore_data_type parameter - # TODODASK - this one needs documenting in the method docstring. - # Question for DH: is ignore_type an alias? self.assertTrue(d2.equals(d, ignore_data_type=True)) @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'ndim'") From 309e91ddfd0b49fab0e2b654abd704d8ae0072f4 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 8 Nov 2021 16:42:44 +0000 Subject: [PATCH 14/55] Add note in Data.equals() providing & regarding old log message --- cf/data/data.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cf/data/data.py b/cf/data/data.py index 386e47be8f..2f9ea1329c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9142,6 +9142,16 @@ def equals( # data are equal for the pair of masked arrays: result = da.all(da.logical_and(mask_comparison, data_comparison)) + # TODODASK: (how) can we supply a post-compute log message such + # as the below, when we return an uncomputed result for the + # `equals` method so don't know if it is appropriate or not? + # Leave this (pre-Daskification message) commented here for now: + # + # logger.info( + # "{0}: Different array values (atol={1}, " + # "rtol={2})".format(self.__class__.__name__, atol, rtol) + # ) + # Return the uncomputed result because there will often be further # steps in the task graph and computing early is inefficient. # From 8a4005b403adc0d613a107756436343619aecbd0 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 9 Nov 2021 13:02:27 +0000 Subject: [PATCH 15/55] Potential alt. approach: pre-compute result inside Data.equals() --- cf/data/data.py | 25 +++------- cf/test/test_Data.py | 112 ++++++++++++++++++++----------------------- 2 files changed, 58 insertions(+), 79 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 2f9ea1329c..c8cdc8c0ae 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9142,24 +9142,13 @@ def equals( # data are equal for the pair of masked arrays: result = da.all(da.logical_and(mask_comparison, data_comparison)) - # TODODASK: (how) can we supply a post-compute log message such - # as the below, when we return an uncomputed result for the - # `equals` method so don't know if it is appropriate or not? - # Leave this (pre-Daskification message) commented here for now: - # - # logger.info( - # "{0}: Different array values (atol={1}, " - # "rtol={2})".format(self.__class__.__name__, atol, rtol) - # ) - - # Return the uncomputed result because there will often be further - # steps in the task graph and computing early is inefficient. - # - # Note that we don't supply an error message (for the False case) - # here because we can't specify when to run it without having - # evaluated by computing, but that's OK because dask will provide - # an appropriate error message to indicate any inequalities. - return result + if not result.compute(): + logger.info( + f"{self.__class__.__name__}: Different array values (" + f"atol={atol}, rtol={rtol})" + ) + else: + return True @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 2cb8825e83..34a7b56bdb 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -174,15 +174,12 @@ def test_Data_equals(self): g = cf.Data(np.ones(shape, dtype="int64"), "m") # different values with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(g.equals(d)) - # TODODASK: (how) can we supply a post-compute log message such - # as the below, when we return an uncomputed result for the - # `equals` method? Leave these tests in until we know. - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) # Test NaN and inf values h = cf.Data(np.full(shape, np.nan), "m") @@ -190,44 +187,40 @@ def test_Data_equals(self): with self.assertLogs(level=cf.log_level().value) as catch: # Compare to d3 not d since np.nan has dtype float64 (IEEE 754) self.assertFalse(h.equals(d3)) - # TODODASK: see TODO in previous block, don't uncomment below - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) i = cf.Data(np.full(shape, np.inf), "m") with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(i.equals(d3)) # np.inf is also of dtype float64 - # TODODASK: see TODO in previous block, don't uncomment below - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(h.equals(i)) - # TODODASK: see TODO in previous block, don't uncomment below - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) # Test masked arrays j = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j.equals(d)) - # TODODASK: see TODO in previous block, don't uncomment below - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) # Test rtol and atol parameters k1 = cf.Data(np.array([10.0, 20.0])) @@ -237,14 +230,13 @@ def test_Data_equals(self): # Only one log check is sufficient here with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(k_1.equals(k_2, atol=0.005, rtol=0)) - # TODODASK: see TODO in previous block, don't uncomment below - # self.assertTrue( - # any( - # "Data: Different array values (atol=0.005, rtol=0)" - # in log_msg - # for log_msg in catch.output - # ) - # ) + self.assertTrue( + any( + "Data: Different array values (atol=0.005, rtol=0)" + in log_msg + for log_msg in catch.output + ) + ) self.assertTrue(k_1.equals(k_2, atol=0.02, rtol=0)) self.assertFalse(k_1.equals(k_2, atol=0, rtol=0.0005)) self.assertTrue(k_1.equals(k_2, atol=0, rtol=0.002)) @@ -254,13 +246,12 @@ def test_Data_equals(self): m2 = cf.Data(1, fill_value=2000) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(m1.equals(m2)) - # TODODASK: see TODO in previous block, don't uncomment below - # self.assertTrue( - # any( - # "Data: Different fill value: 1000 != 2000" in log_msg - # for log_msg in catch.output - # ) - # ) + self.assertTrue( + any( + "Data: Different fill value: 1000 != 2000" in log_msg + for log_msg in catch.output + ) + ) self.assertTrue(m1.equals(m2, ignore_fill_value=True)) # Test verbose parameter: 1/'INFO' level is behaviour change boundary @@ -268,15 +259,14 @@ def test_Data_equals(self): verbosity_level, expect_to_see_msg = checks with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(d2.equals(d, verbose=verbosity_level)) - # TODODASK: see TODO in previous block, don't uncomment below - # self.assertIs( - # any( - # "Data: Different data types: float32 != int64" - # in log_msg - # for log_msg in catch.output - # ), - # expect_to_see_msg, - # ) + self.assertIs( + any( + "Data: Different data types: float32 != int64" + in log_msg + for log_msg in catch.output + ), + expect_to_see_msg, + ) # Test ignore_data_type parameter self.assertTrue(d2.equals(d, ignore_data_type=True)) From 7a01cbe0f926f675713efc54eb763cbe29e4c15f Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 9 Nov 2021 13:34:03 +0000 Subject: [PATCH 16/55] Add developer notes on questions RE laziness of execution --- cf/data/README.rst | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/cf/data/README.rst b/cf/data/README.rst index 6e404b720e..c746e5adf0 100644 --- a/cf/data/README.rst +++ b/cf/data/README.rst @@ -22,3 +22,79 @@ The mask hardness is most easily reset with the `cf.Data.__setitem__` and `cf.Data.where` are examples of methods that need to reset the mask in this manner. + + +Laziness +-------- + +To *be* lazy, or *not to be* lazy (in `cf.Data` itself)? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Central to Dask is lazy execution i.e. delayed computation: +Dask operations essentially construct a graph of calculations +or transformations (etc.) that are ready to run later, +and only get evaluated together when requested with +a `.compute` call. + +We want to utilise this laziness because it is central to the +efficiency from using Dask, but to what extent to do we want +to incorporate laziness into `cf.Data`? Namely, for +an arbitary `cf.Data` method previously returning some result +(say, a Boolean or an array), which of these should we return: + +1. The **pre-computed result**, i.e. the outcome from running + `compute` on the result graph constructed in the method + (e.g. the same Boolean or an array, etc., as before); or +2. The **uncomputed result**, i.e. a Dask object which only + evaluates to the result in (1) when either the user or + the code under-the-hood, later, runs a `compute`? + +Arguments for choice (1) [advantages to (1) and disadvantages to (2)]: + +* The simpler choice: + * means output is the same as before so documentation is easier and + less change relative to previous versions; + * logging and error handling can remain simple and as-is, whereas + choice (2) would mean we don't know whether a given log or error + message, dependent on the outcome, is applicable, so we can't + call it immediately (perhaps at all?). We might have to defer to + standard Dask messages, which would reduce specificity and clarity. + * Testing will be simpler, as with (2) we would have to add `compute` + calls in at appropriate points before running test assertions, etc. + * Inspection methods can return as they do now, whereas with choice (2) + we would have to work out what to show when certain aspects aren't + yet computed. + +Arguments for choice (2): + +* The technically more complicated but more efficient choice, overall: + * This choice is more efficient when we build up chains of operations, + because it avoids intermediate computation meaning parallelisation can + be optimised more comprehensively by Dask. + +There may be at least one other option: + +3. Could we, temporariliy or permanently, make use of a common keyword + argument such as `precompute` on methods so users and under-the-hood in + the code we can dictate whether or not to return the pre-computed or + uncomputed result? That would give extra flexibility, but mean more + boilerplate code (which can be consolidated somewhat, but at best + will require some extra lines per method). + + If so, what would the best default be, `True` or `False`? + +I think we need to ensure we are consistent in our approach, so choose either +(1), (2) or (3) (or another alternative), rather than a mixture, as that +will be a maintenance nightmare! + + +Logging and error handling +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When Dask operations are uncomputed, we don't know whether certain logging +and error messages are applicable or not. + +Can we raise these in a delayed way, when we don't want to compute +early, in the case we are in the middle of under-the-hood operations and +also perhaps if we choose case (2) from the above points on extent of +laziness? How can it be done? From 73ebee7db1e1f7179f0080b37f8c1e58a3a3196c Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 9 Nov 2021 22:21:18 +0000 Subject: [PATCH 17/55] Update developer notes with further points from discussion --- cf/data/README.rst | 70 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/cf/data/README.rst b/cf/data/README.rst index c746e5adf0..e78ff6962c 100644 --- a/cf/data/README.rst +++ b/cf/data/README.rst @@ -52,6 +52,7 @@ an arbitary `cf.Data` method previously returning some result Arguments for choice (1) [advantages to (1) and disadvantages to (2)]: * The simpler choice: + * means output is the same as before so documentation is easier and less change relative to previous versions; * logging and error handling can remain simple and as-is, whereas @@ -68,6 +69,7 @@ Arguments for choice (1) [advantages to (1) and disadvantages to (2)]: Arguments for choice (2): * The technically more complicated but more efficient choice, overall: + * This choice is more efficient when we build up chains of operations, because it avoids intermediate computation meaning parallelisation can be optimised more comprehensively by Dask. @@ -97,4 +99,70 @@ and error messages are applicable or not. Can we raise these in a delayed way, when we don't want to compute early, in the case we are in the middle of under-the-hood operations and also perhaps if we choose case (2) from the above points on extent of -laziness? How can it be done? +laziness? How can it be done? Possible ideas include: + +* Using a `try/except` block whenever a custom error message is required, + catching the corresponding Dask errors and raising our own messages. + + +Masked arrays +------------- + +For methods such as `equals`, we need to consider whether an array is +a masked one, and if so, we need to consider the *masks* (e.g. whether they +are equal), as well as the *data* (equality or otherwise). + +But the difficulty is that some level of inspection, i.e. computation, is +required to know whether the object in question is masked or not! (This is +due to, fundamentally, the underlying netCDF or PP representation.) +And we want to avoid early computation, as again it is inefficient. + +Consider, for example, the case of a set of computations in which an +array may acquire a mask, or may not: until the `compute` is run, +we don't know whether there is a mask at the end. Note there is a +distinction here between a standard `array` and a `masked` array +which may have a trivial (say, all `False`) or non-trivial mask, e.g. +for Dask array cases (similarly for `np.ma` etc.): + +**Masked array, non-trivial mask:** + +.. code-block:: python + + >>> dx = da.from_array(np.ma.array([1, 2, 3], mask=[1, 0, 0])) + >>> dx + dask.array + +**Standard array, trivial i.e. all-Falsy mask:** + +.. code-block:: python + + >>> dy = da.from_array(np.ma.array([1, 2, 3], mask=[0, 0, 0])) + >>> dy + dask.array + +**Standard array i.e. no mask:** + +.. code-block:: python + + >>> dz = da.from_array(np.array([1, 2, 3])) + >>> dz + dask.array + + +After discussion, in order to resolve this issue, we proposed +tentatively that *we should make all arrays are of the masked variety, +i.e. `da.ma.masked_array` rather than `da.array`, so in the case of +an array that would otherwise be a standard (unmasked) one, it would +instead be a `da.ma.masked_array` with a fully Falsy mask. + +In practice this would mean that when we instantiate an object +directly from disk, we would edit the `_meta` attribute to +set it to masked. Though we need to evaluate the performance hit +of this to ensure it isn't significant. + + +Inheritance from `cfdm` +----------------------- + +Generally, how do we deal with optimisation for objects and logic inherited +from `cfdm`, since the current plan is not to Daskify `cfdm.Data`? From db3b871acf3390bf01f5ad8d5244c47e11e785e2 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 10 Nov 2021 14:25:51 +0000 Subject: [PATCH 18/55] Update _da_ma_allclose to use blockwise() in line with DH suggestion --- cf/data/utils.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cf/data/utils.py b/cf/data/utils.py index e7bb0633c6..4438e3b170 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -56,15 +56,33 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): the given rtol and atol tolerance. """ - x = da.asanyarray(x) - y = da.asanyarray(y) - return da.map_blocks( - np.ma.allclose, + + def allclose(a_blocks, b_blocks): + """Run `ma.allclose` across multiple blocks over two arrays.""" + result = True + for a, b in zip(a_blocks, b_blocks): + result &= np.ma.allclose( + a, b, rtol=rtol, atol=atol, masked_equal=masked_equal + ) + + return result + + # Handle scalars, which are not valid inputs to da.blockwise, though + # test for scalars by checking the shape to avoid computation, etc. + if not x.shape and not y.shape: # i.e. both are scalars + return np.ma.allclose(x, y) + elif not x.shape or not y.shape: + return False # one is a scalar and the other isn't => not all close + + axes = tuple(range(x.ndim)) + return da.blockwise( + allclose, + "", x, + axes, y, - masked_equal=masked_equal, - rtol=rtol, - atol=atol, + axes, + dtype=bool, ) From cbc67a5236c6db64ca42c3ccc16d16cf64415e1c Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 10 Nov 2021 14:43:50 +0000 Subject: [PATCH 19/55] Add checking on scalar inputs to test_Data_equals --- cf/data/utils.py | 9 ++++----- cf/test/test_Data.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/cf/data/utils.py b/cf/data/utils.py index 4438e3b170..19b8c4ad0f 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -67,12 +67,11 @@ def allclose(a_blocks, b_blocks): return result - # Handle scalars, which are not valid inputs to da.blockwise, though - # test for scalars by checking the shape to avoid computation, etc. - if not x.shape and not y.shape: # i.e. both are scalars + # Handle scalars: da.blockwise will raise a TypeError if both of its array + # inputs are scalar, though if only one is scalar it manages. Test for + # scalars by checking the shape (scalar has '()') to avoid computation. + if not x.shape and not y.shape: return np.ma.allclose(x, y) - elif not x.shape or not y.shape: - return False # one is a scalar and the other isn't => not all close axes = tuple(range(x.ndim)) return da.blockwise( diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 34a7b56bdb..ee765bfbcb 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -222,6 +222,37 @@ def test_Data_equals(self): ) ) + # Test where inputs are scalars + s1 = cf.Data(1) + s2 = cf.Data(10) + s3 = cf.Data("a_string") + # 1. both are scalars + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(s1.equals(s2)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(s1.equals(s3)) + self.assertTrue( + any( + "Data: Different data types: int64 != Date: Wed, 10 Nov 2021 19:37:18 +0000 Subject: [PATCH 20/55] Add test module for the Data utility functions, test_Data_utils --- cf/test/test_Data_utils.py | 80 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 cf/test/test_Data_utils.py diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py new file mode 100644 index 0000000000..72cefe4bba --- /dev/null +++ b/cf/test/test_Data_utils.py @@ -0,0 +1,80 @@ +import datetime +import faulthandler +import unittest + +import dask.array as da +import numpy as np + +faulthandler.enable() # to debug seg faults and timeouts + +import cf + + +class DataUtilsTest(unittest.TestCase): + def test_Data_Utils__da_ma_allclose(self): + """TODO.""" + # Create a range of inputs to test against + a = np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]) + b = np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]) + c = np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]) + d = np.array([1.0, 2.0, 3.0]) + e = a + 5e-04 # outside of default tolerances + f = a + 5e-06 # within default tolerances + + # Test the function with these inputs as both numpy and dask arrays... + allclose = cf.data.utils._da_ma_allclose + da_ = da.from_array(a) + + self.assertTrue(allclose(a, a).compute()) + self.assertTrue(allclose(da_, da_).compute()) + + self.assertTrue(allclose(a, b).compute()) + self.assertTrue(allclose(da_, da.from_array(b)).compute()) + # ...including testing the 'masked_equal' parameter + self.assertFalse(allclose(a, b, masked_equal=False).compute()) + self.assertFalse( + allclose(da_, da.from_array(b), masked_equal=False).compute() + ) + + self.assertFalse(allclose(a, c).compute()) + self.assertFalse(allclose(da_, da.from_array(c)).compute()) + + self.assertTrue(allclose(a, d).compute()) + self.assertTrue(allclose(da_, da.from_array(d)).compute()) + + self.assertFalse(allclose(a, e).compute()) + self.assertFalse(allclose(da_, da.from_array(e)).compute()) + + self.assertTrue(allclose(a, f).compute()) + self.assertTrue(allclose(da_, da.from_array(f)).compute()) + + # Test when array inputs have different chunk sizes + da_ = da.from_array(a, chunks=(1, 2)) + self.assertTrue(allclose(da_, da.from_array(b, chunks=(3,))).compute()) + self.assertFalse( + allclose( + da_, da.from_array(b, chunks=(3,)), masked_equal=False + ).compute() + ) + self.assertFalse( + allclose(da_, da.from_array(c, chunks=(3,))).compute() + ) + + # Test the 'rtol' and 'atol' parameters: + self.assertFalse(allclose(a, e, rtol=1e-06).compute()) + self.assertFalse(allclose(da_, da.from_array(e), rtol=1e-06).compute()) + b1 = a / 10000 + b2 = e / 10000 + self.assertTrue(allclose(b1, b2, atol=1e-05).compute()) + self.assertTrue( + allclose( + da.from_array(b1), da.from_array(b2), atol=1e-05 + ).compute() + ) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print() + unittest.main(verbosity=2) From 845255b60e24580d46c781b573d9d9d225c228ad Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 11 Nov 2021 15:36:29 +0000 Subject: [PATCH 21/55] Some fixes to logic in _da_ma_allclose --- cf/data/data.py | 1 + cf/data/utils.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index c8cdc8c0ae..21a1387307 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9147,6 +9147,7 @@ def equals( f"{self.__class__.__name__}: Different array values (" f"atol={atol}, rtol={rtol})" ) + return False else: return True diff --git a/cf/data/utils.py b/cf/data/utils.py index 19b8c4ad0f..f283ef0767 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -62,7 +62,11 @@ def allclose(a_blocks, b_blocks): result = True for a, b in zip(a_blocks, b_blocks): result &= np.ma.allclose( - a, b, rtol=rtol, atol=atol, masked_equal=masked_equal + a, + b, + masked_equal=masked_equal, + rtol=rtol, + atol=atol, ) return result @@ -71,7 +75,9 @@ def allclose(a_blocks, b_blocks): # inputs are scalar, though if only one is scalar it manages. Test for # scalars by checking the shape (scalar has '()') to avoid computation. if not x.shape and not y.shape: - return np.ma.allclose(x, y) + return np.ma.allclose( + x, y, masked_equal=masked_equal, rtol=rtol, atol=atol + ) axes = tuple(range(x.ndim)) return da.blockwise( From 2b73fadd7433193289efb8421120374b4bba318c Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 12 Nov 2021 18:00:06 +0000 Subject: [PATCH 22/55] Move _da_ma_allclose from data.utils to data.dask_utils --- cf/data/dask_utils.py | 82 ++++++++++++++++++++++++++++++++++++++ cf/data/data.py | 8 +++- cf/data/utils.py | 81 ------------------------------------- cf/test/test_Data_utils.py | 2 +- 4 files changed, 89 insertions(+), 84 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 7a6c4dcc07..0b02f5ac10 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -5,9 +5,91 @@ """ +import dask.array as da import numpy as np +def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): + """An effective dask.array.ma.allclose method. + + True if two dask arrays are element-wise equal within + a tolerance. + + Equivalent to allclose except that masked values are treated + as equal (default) or unequal, depending on the masked_equal + argument. + + Define an effective da.ma.allclose method here because one is + currently missing in the Dask codebase. + + Note that all default arguments are the same as those provided to + the corresponding NumPy method (see the `numpy.ma.allclose` API + reference). + + TODODASK: put in a PR to Dask to request to add as genuine method. + + .. versionadded:: 4.0.0 + + :Parameters: + + x: a dask array to compare with y + + y: a dask array to compare with x + + masked_equal: + Whether masked values in a and b are considered + equal (True) or not (False). They are considered equal + by default. + + rtol: + Relative tolerance. Default is 1e-05. + + atol: + Absolute tolerance. Default is 1e-08. + + :Returns: + + Boolean + A Boolean value indicating whether or not the + two dask arrays are element-wise equal to + the given rtol and atol tolerance. + + """ + + def allclose(a_blocks, b_blocks): + """Run `ma.allclose` across multiple blocks over two arrays.""" + result = True + for a, b in zip(a_blocks, b_blocks): + result &= np.ma.allclose( + a, + b, + masked_equal=masked_equal, + rtol=rtol, + atol=atol, + ) + + return result + + # Handle scalars: da.blockwise will raise a TypeError if both of its array + # inputs are scalar, though if only one is scalar it manages. Test for + # scalars by checking the shape (scalar has '()') to avoid computation. + if not x.shape and not y.shape: + return np.ma.allclose( + x, y, masked_equal=masked_equal, rtol=rtol, atol=atol + ) + + axes = tuple(range(x.ndim)) + return da.blockwise( + allclose, + "", + x, + axes, + y, + axes, + dtype=bool, + ) + + def cf_harden_mask(a): """Harden the mask of a masked `numpy` array. diff --git a/cf/data/data.py b/cf/data/data.py index 21a1387307..ed5a26bf19 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -101,13 +101,17 @@ generate_axis_identifiers, to_dask, ) -from .dask_utils import cf_harden_mask, cf_soften_mask, cf_where +from .dask_utils import ( + _da_ma_allclose, + cf_harden_mask, + cf_soften_mask, + cf_where, +) from .filledarray import FilledArray from .mixin import DataClassDeprecationsMixin from .partition import Partition from .partitionmatrix import PartitionMatrix from .utils import ( # is_small,; is_very_small, - _da_ma_allclose, conform_units, convert_to_datetime, convert_to_reftime, diff --git a/cf/data/utils.py b/cf/data/utils.py index f283ef0767..429dfe5da6 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -10,87 +10,6 @@ from ..units import Units -def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): - """An effective dask.array.ma.allclose method. - - True if two dask arrays are element-wise equal within - a tolerance. - - Equivalent to allclose except that masked values are treated - as equal (default) or unequal, depending on the masked_equal - argument. - - Define an effective da.ma.allclose method here because one is - currently missing in the Dask codebase. - - Note that all default arguments are the same as those provided to - the corresponding NumPy method (see the `numpy.ma.allclose` API - reference). - - TODODASK: put in a PR to Dask to request to add as genuine method. - - .. versionadded:: 4.0.0 - - :Parameters: - - x: a dask array to compare with y - - y: a dask array to compare with x - - masked_equal: - Whether masked values in a and b are considered - equal (True) or not (False). They are considered equal - by default. - - rtol: - Relative tolerance. Default is 1e-05. - - atol: - Absolute tolerance. Default is 1e-08. - - :Returns: - - Boolean - A Boolean value indicating whether or not the - two dask arrays are element-wise equal to - the given rtol and atol tolerance. - - """ - - def allclose(a_blocks, b_blocks): - """Run `ma.allclose` across multiple blocks over two arrays.""" - result = True - for a, b in zip(a_blocks, b_blocks): - result &= np.ma.allclose( - a, - b, - masked_equal=masked_equal, - rtol=rtol, - atol=atol, - ) - - return result - - # Handle scalars: da.blockwise will raise a TypeError if both of its array - # inputs are scalar, though if only one is scalar it manages. Test for - # scalars by checking the shape (scalar has '()') to avoid computation. - if not x.shape and not y.shape: - return np.ma.allclose( - x, y, masked_equal=masked_equal, rtol=rtol, atol=atol - ) - - axes = tuple(range(x.ndim)) - return da.blockwise( - allclose, - "", - x, - axes, - y, - axes, - dtype=bool, - ) - - def convert_to_datetime(array, units): """Convert a daskarray to. diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index 72cefe4bba..642f29eeae 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -22,7 +22,7 @@ def test_Data_Utils__da_ma_allclose(self): f = a + 5e-06 # within default tolerances # Test the function with these inputs as both numpy and dask arrays... - allclose = cf.data.utils._da_ma_allclose + allclose = cf.data.dask_utils._da_ma_allclose da_ = da.from_array(a) self.assertTrue(allclose(a, a).compute()) From 1cf03127485aebe25495de3164dfd6d8e366841b Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 12 Nov 2021 19:40:55 +0000 Subject: [PATCH 23/55] Improve handling of scalars and 0-d arrays in _da_ma_allclose --- cf/data/dask_utils.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 0b02f5ac10..3a8555accf 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -59,6 +59,19 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): def allclose(a_blocks, b_blocks): """Run `ma.allclose` across multiple blocks over two arrays.""" result = True + # Handle scalars, including 0-d arrays, for which a_blocks and + # b_blocks will have the corresponding type and hence not be iterable. + # With this approach, we avoid inspecting sizes or lengths, and for + # the 0-d array blocks the following iteration can be used unchanged + # and will only execute once with block sizes as desired of: + # (np.array(),)[0] = array(). Note + # can't check against more general case of collections.abc.Iterable + # because a 0-d array is also iterable, but in practice always a list. + if not isinstance(a_blocks, list): + a_blocks = (a_blocks,) + if not isinstance(b_blocks, list): + b_blocks = (b_blocks,) + for a, b in zip(a_blocks, b_blocks): result &= np.ma.allclose( a, @@ -70,14 +83,6 @@ def allclose(a_blocks, b_blocks): return result - # Handle scalars: da.blockwise will raise a TypeError if both of its array - # inputs are scalar, though if only one is scalar it manages. Test for - # scalars by checking the shape (scalar has '()') to avoid computation. - if not x.shape and not y.shape: - return np.ma.allclose( - x, y, masked_equal=masked_equal, rtol=rtol, atol=atol - ) - axes = tuple(range(x.ndim)) return da.blockwise( allclose, From 97478b28b8c91273df99bcf980c4d7553a4855ad Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 15 Nov 2021 17:32:15 +0000 Subject: [PATCH 24/55] Update _da_ma_allclose test to highlight difference w/ Data.equals --- cf/test/test_Data_utils.py | 51 +++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index 642f29eeae..c811dc1582 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -13,8 +13,16 @@ class DataUtilsTest(unittest.TestCase): def test_Data_Utils__da_ma_allclose(self): """TODO.""" - # Create a range of inputs to test against + # Create a range of inputs to test against. + # Note that 'a' and 'a2' should be treated as 'allclose' for this + # method, the same result as np.ma.allclose would give because all + # of the *unmasked* elements are 'allclose', whereas in our + # Data.equals method that builds on this method, we go even further + # and insist on the mask being identical as well as the data + # (separately, i.e. unmasked) all being 'allclose', so inside our + # cf.Data objects 'a' and 'a2' would instead *not* be considered equal. a = np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]) + a2 = np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]) b = np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]) c = np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]) d = np.array([1.0, 2.0, 3.0]) @@ -28,43 +36,46 @@ def test_Data_Utils__da_ma_allclose(self): self.assertTrue(allclose(a, a).compute()) self.assertTrue(allclose(da_, da_).compute()) - self.assertTrue(allclose(a, b).compute()) - self.assertTrue(allclose(da_, da.from_array(b)).compute()) + self.assertTrue(allclose(a2, a).compute()) + self.assertTrue(allclose(da.from_array(a2), da_).compute()) + + self.assertTrue(allclose(b, a).compute()) + self.assertTrue(allclose(da.from_array(b), da_).compute()) # ...including testing the 'masked_equal' parameter - self.assertFalse(allclose(a, b, masked_equal=False).compute()) + self.assertFalse(allclose(b, a, masked_equal=False).compute()) self.assertFalse( - allclose(da_, da.from_array(b), masked_equal=False).compute() + allclose(da.from_array(b), da_, masked_equal=False).compute() ) - self.assertFalse(allclose(a, c).compute()) - self.assertFalse(allclose(da_, da.from_array(c)).compute()) + self.assertFalse(allclose(c, a).compute()) + self.assertFalse(allclose(da.from_array(c), da_).compute()) - self.assertTrue(allclose(a, d).compute()) - self.assertTrue(allclose(da_, da.from_array(d)).compute()) + self.assertTrue(allclose(d, a).compute()) + self.assertTrue(allclose(da.from_array(d), da_).compute()) - self.assertFalse(allclose(a, e).compute()) - self.assertFalse(allclose(da_, da.from_array(e)).compute()) + self.assertFalse(allclose(e, a).compute()) + self.assertFalse(allclose(da.from_array(e), da_).compute()) - self.assertTrue(allclose(a, f).compute()) - self.assertTrue(allclose(da_, da.from_array(f)).compute()) + self.assertTrue(allclose(f, a).compute()) + self.assertTrue(allclose(da.from_array(f), da_).compute()) # Test when array inputs have different chunk sizes da_ = da.from_array(a, chunks=(1, 2)) - self.assertTrue(allclose(da_, da.from_array(b, chunks=(3,))).compute()) + self.assertTrue(allclose(da.from_array(b, chunks=(3,)), da_).compute()) self.assertFalse( allclose( - da_, da.from_array(b, chunks=(3,)), masked_equal=False + da.from_array(b, chunks=(3,)), da_, masked_equal=False ).compute() ) self.assertFalse( - allclose(da_, da.from_array(c, chunks=(3,))).compute() + allclose(da.from_array(c, chunks=(3,)), da_).compute() ) # Test the 'rtol' and 'atol' parameters: - self.assertFalse(allclose(a, e, rtol=1e-06).compute()) - self.assertFalse(allclose(da_, da.from_array(e), rtol=1e-06).compute()) - b1 = a / 10000 - b2 = e / 10000 + self.assertFalse(allclose(e, a, rtol=1e-06).compute()) + self.assertFalse(allclose(da.from_array(e), da_, rtol=1e-06).compute()) + b1 = e / 10000 + b2 = a / 10000 self.assertTrue(allclose(b1, b2, atol=1e-05).compute()) self.assertTrue( allclose( From 8db307b65ea53a8427befb3af65b8b3fe75669fd Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 15 Nov 2021 18:26:33 +0000 Subject: [PATCH 25/55] Update masked array coverage in test_Data_equals --- cf/test/test_Data.py | 46 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index ee765bfbcb..7ddfea552d 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -212,9 +212,51 @@ def test_Data_equals(self): ) # Test masked arrays - j = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") + # 1. Example case where the masks differ only (data is identical) + j1 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + j2 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]), "m") with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(j.equals(d)) + self.assertFalse(j1.equals(j2)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) + # 2. Example case where the data differs only (masks are identical) + j3 = cf.Data(np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]), "m") + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(j1.equals(j3)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) + + # 3. Trivial case of data that is fully masked + j4 = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") + ### self.assertTrue(j4.equals(j4)) # check self-equality! + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(j4.equals(d)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) + # 4. Case where all the unmasked data is 'allclose' to other data but + # the data is not 'allclose' to it where it is masked, i.e. the data + # on its own (namely without considering the mask) is not equal to the + # other data on its own (e.g. note the 0-th element in below examples). + # This differs to case (2): there data differs *only where unmasked*. + # Note these should *not* be considered equal inside cf.Data, whereas + # np.ma.allclose and indeed our own _da_ma_allclose methods do hold + # these to be 'allclose': Data.equals is stricter than _da_ma_allclose. + j5 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + j6 = cf.Data(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(j5.equals(j6)) self.assertTrue( any( "Data: Different array values" in log_msg From 63161e612e80f478ea1a9eba44e9b10b15008ea6 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 16 Nov 2021 18:31:22 +0000 Subject: [PATCH 26/55] Add self-equality & masked string array checks to test_Data_equals --- cf/test/test_Data.py | 84 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 3 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 7ddfea552d..3b208029a1 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -139,9 +139,10 @@ def test_Data_equals(self): a = np.arange(12).reshape(*shape) d = cf.Data(a, "m") - self.assertTrue(d.equals(d)) # trivial check + self.assertTrue(d.equals(d)) # also do self-equality checks! d2 = cf.Data(a.astype(np.float32), "m") # different datatype to d + self.assertTrue(d2.equals(d2)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(d2.equals(d)) self.assertTrue( @@ -152,6 +153,7 @@ def test_Data_equals(self): ) e = cf.Data(a, "s") # different units to d + self.assertTrue(e.equals(e)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(e.equals(d)) self.assertTrue( @@ -162,6 +164,7 @@ def test_Data_equals(self): ) f = cf.Data(np.arange(12), "m") # different shape to d + self.assertTrue(f.equals(f)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(f.equals(d)) self.assertTrue( @@ -172,6 +175,7 @@ def test_Data_equals(self): ) g = cf.Data(np.ones(shape, dtype="int64"), "m") # different values + self.assertTrue(g.equals(g)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(g.equals(d)) self.assertTrue( @@ -182,8 +186,11 @@ def test_Data_equals(self): ) # Test NaN and inf values - h = cf.Data(np.full(shape, np.nan), "m") d3 = cf.Data(a.astype(np.float64), "m") + h = cf.Data(np.full(shape, np.nan), "m") + # TODODASK: is this OK given that NaN in NumPy aren't equal to e/o + # or should this be assertTrue to expect equality behaviour? + self.assertFalse(h.equals(h)) with self.assertLogs(level=cf.log_level().value) as catch: # Compare to d3 not d since np.nan has dtype float64 (IEEE 754) self.assertFalse(h.equals(d3)) @@ -194,6 +201,7 @@ def test_Data_equals(self): ) ) i = cf.Data(np.full(shape, np.inf), "m") + self.assertTrue(i.equals(i)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(i.equals(d3)) # np.inf is also of dtype float64 self.assertTrue( @@ -214,7 +222,11 @@ def test_Data_equals(self): # Test masked arrays # 1. Example case where the masks differ only (data is identical) j1 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(j1.equals(j1)) j2 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(j2.equals(j2)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j2)) self.assertTrue( @@ -225,6 +237,8 @@ def test_Data_equals(self): ) # 2. Example case where the data differs only (masks are identical) j3 = cf.Data(np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(j3.equals(j3)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j3)) self.assertTrue( @@ -236,7 +250,8 @@ def test_Data_equals(self): # 3. Trivial case of data that is fully masked j4 = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") - ### self.assertTrue(j4.equals(j4)) # check self-equality! + # TODODASK: uncomment below, not working yet + ### self.assertTrue(j4.equals(j4)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j4.equals(d)) self.assertTrue( @@ -254,7 +269,11 @@ def test_Data_equals(self): # np.ma.allclose and indeed our own _da_ma_allclose methods do hold # these to be 'allclose': Data.equals is stricter than _da_ma_allclose. j5 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(j5.equals(j5)) j6 = cf.Data(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(j6.equals(j6)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j5.equals(j6)) self.assertTrue( @@ -264,10 +283,65 @@ def test_Data_equals(self): ) ) + # Test non-numeric dtype arrays + sa1 = cf.Data(np.array(["one", "two", "three"], dtype="S5"), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(sa1.equals(sa1)) + sa2_data = np.array(["one", "two", "four"], dtype="S4") + sa2 = cf.Data(sa2_data, "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(sa2.equals(sa2)) + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(sa1.equals(sa2)) + print(catch.output) + self.assertTrue( + any( + "Data: Different data types: |S5 != |S4" in log_msg + for log_msg in catch.output + ) + ) + sa3_data = sa2_data.astype("S5") + sa3 = cf.Data(sa3_data, "m") + # TODODASK: uncomment below, not working yet + # with self.assertLogs(level=cf.log_level().value) as catch: + # self.assertFalse(sa1.equals(sa3)) + # print(catch.output) + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) + # ...including masked string arrays + sa4 = cf.Data(np.ma.array( + ["one", "two", "three"], mask=[0, 0, 1], dtype="S5", ), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(sa4.equals(sa4)) + sa5 = cf.Data(np.ma.array( + ["one", "two", "three"], mask=[0, 1, 0], dtype="S5", ), "m") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(sa5.equals(sa5)) + + # Test where inputs are scalars + # TODODASK: uncomment below, not working yet + # with self.assertLogs(level=cf.log_level().value) as catch: + # self.assertFalse(sa4.equals(sa5)) + # print(catch.output) + # self.assertTrue( + # any( + # "Data: Different array values" in log_msg + # for log_msg in catch.output + # ) + # ) + # Test where inputs are scalars s1 = cf.Data(1) + self.assertTrue(s1.equals(s1)) s2 = cf.Data(10) + self.assertTrue(s2.equals(s2)) s3 = cf.Data("a_string") + # TODODASK: uncomment below, not working yet + ### self.assertTrue(s3.equals(s3)) # 1. both are scalars with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(s1.equals(s2)) @@ -297,7 +371,9 @@ def test_Data_equals(self): # Test rtol and atol parameters k1 = cf.Data(np.array([10.0, 20.0])) + self.assertTrue(k1.equals(k1)) k2 = cf.Data(np.array([10.01, 20.01])) + self.assertTrue(k2.equals(k2)) for ks in [(k1, k2), (k2, k1)]: # to test symmetry/commutativity k_1, k_2 = ks # Only one log check is sufficient here @@ -316,7 +392,9 @@ def test_Data_equals(self): # Test ignore_fill_value parameter m1 = cf.Data(1, fill_value=1000) + self.assertTrue(m1.equals(m1)) m2 = cf.Data(1, fill_value=2000) + self.assertTrue(m2.equals(m2)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(m1.equals(m2)) self.assertTrue( From b78551b0d5074857c79fbe3edf1d027824f84a66 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 17 Nov 2021 19:31:25 +0000 Subject: [PATCH 27/55] _da_ma_allclose: set kwargs in inner function w/ note explaining why --- cf/data/dask_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 3a8555accf..9890c29686 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -56,7 +56,10 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): """ - def allclose(a_blocks, b_blocks): + # Must pass rtol=rtol, atol=atol in as kwargs to allclose, rather than it + # using those in local scope from the outer function arguments, because + # Dask's internal algorithms require these to be set as parameters. + def allclose(a_blocks, b_blocks, rtol=rtol, atol=atol): """Run `ma.allclose` across multiple blocks over two arrays.""" result = True # Handle scalars, including 0-d arrays, for which a_blocks and @@ -92,6 +95,8 @@ def allclose(a_blocks, b_blocks): y, axes, dtype=bool, + rtol=rtol, + atol=atol, ) From 153dee086080665503fa3f730a32688c66fc25cf Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 17 Nov 2021 19:43:47 +0000 Subject: [PATCH 28/55] Update developer notes with agreed approach to cf.Data method laziness --- cf/data/README.rst | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/cf/data/README.rst b/cf/data/README.rst index e78ff6962c..08e04fcf9f 100644 --- a/cf/data/README.rst +++ b/cf/data/README.rst @@ -74,20 +74,34 @@ Arguments for choice (2): because it avoids intermediate computation meaning parallelisation can be optimised more comprehensively by Dask. -There may be at least one other option: +As well as choice (1) or (2) outright, there are further options for +a mixture or a flexible choice of return object in this respect: -3. Could we, temporariliy or permanently, make use of a common keyword - argument such as `precompute` on methods so users and under-the-hood in +3. Make use of a common keyword argument such as `precompute` + on methods so users and under-the-hood in the code we can dictate whether or not to return the pre-computed or uncomputed result? That would give extra flexibility, but mean more boilerplate code (which can be consolidated somewhat, but at best will require some extra lines per method). - If so, what would the best default be, `True` or `False`? + If this option is chosen, what would the best default be, `True` + or `False`? -I think we need to ensure we are consistent in our approach, so choose either -(1), (2) or (3) (or another alternative), rather than a mixture, as that -will be a maintenance nightmare! +4. (DH's suggestion) Methods that return new cf.Data objects + (such as transpose) should be lazy and other methods should not be + (e.g. __repr__ and equals). + +**We have agreed that (4) is the most sensible approach to take, therefore +the working plan is** that: + +* **any method (previously) returning a cf.Data object will, + post-daskification, belazy and return the uncomputed result**, i.e. a + Dask object that, when computed, will evaluate to the final cf.Data + object (e.g. if computed immediately after the method runs, the result + would be the same cf.Data object as that previously returned); but +* **any method returning another object, such as a Boolean or a string + representation of the object, will not be lazy and + return the pre-computed object as before**. Logging and error handling @@ -150,7 +164,7 @@ for Dask array cases (similarly for `np.ma` etc.): After discussion, in order to resolve this issue, we proposed -tentatively that *we should make all arrays are of the masked variety, +tentatively that *we should ensure all arrays are of the masked variety*, i.e. `da.ma.masked_array` rather than `da.array`, so in the case of an array that would otherwise be a standard (unmasked) one, it would instead be a `da.ma.masked_array` with a fully Falsy mask. From 71f11106ce4063bb23b658c0f701b283fc703224 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 17 Nov 2021 19:59:06 +0000 Subject: [PATCH 29/55] Re-organise developer notes to tidy --- cf/data/README.rst | 117 +++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 57 deletions(-) diff --git a/cf/data/README.rst b/cf/data/README.rst index 08e04fcf9f..e57755e68b 100644 --- a/cf/data/README.rst +++ b/cf/data/README.rst @@ -1,8 +1,67 @@ `cf.Data` developer notes ========================= +Masked arrays +------------- + +Whether there is a mask or not +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For methods such as `equals`, we need to consider whether an array is +a masked one, and if so, we need to consider the *masks* (e.g. whether they +are equal), as well as the *data* (equality or otherwise). + +But the difficulty is that some level of inspection, i.e. computation, is +required to know whether the object in question is masked or not! (This is +due to, fundamentally, the underlying netCDF or PP representation.) +And we want to avoid early computation, as again it is inefficient. + +Consider, for example, the case of a set of computations in which an +array may acquire a mask, or may not: until the `compute` is run, +we don't know whether there is a mask at the end. Note there is a +distinction here between a standard `array` and a `masked` array +which may have a trivial (say, all `False`) or non-trivial mask, e.g. +for Dask array cases (similarly for `np.ma` etc.): + +**Masked array, non-trivial mask:** + +.. code-block:: python + + >>> dx = da.from_array(np.ma.array([1, 2, 3], mask=[1, 0, 0])) + >>> dx + dask.array + +**Standard array, trivial i.e. all-Falsy mask:** + +.. code-block:: python + + >>> dy = da.from_array(np.ma.array([1, 2, 3], mask=[0, 0, 0])) + >>> dy + dask.array + +**Standard array i.e. no mask:** + +.. code-block:: python + + >>> dz = da.from_array(np.array([1, 2, 3])) + >>> dz + dask.array + + +After discussion, in order to resolve this issue, we proposed +tentatively that *we should ensure all arrays are of the masked variety*, +i.e. `da.ma.masked_array` rather than `da.array`, so in the case of +an array that would otherwise be a standard (unmasked) one, it would +instead be a `da.ma.masked_array` with a fully Falsy mask. + +In practice this would mean that when we instantiate an object +directly from disk, we would edit the `_meta` attribute to +set it to masked. Though we need to evaluate the performance hit +of this to ensure it isn't significant. + + Hardness of the mask --------------------- +^^^^^^^^^^^^^^^^^^^^ Any `cf.Data` method that changes the dask array should consider whether or not the mask hardness needs resetting before @@ -119,62 +178,6 @@ laziness? How can it be done? Possible ideas include: catching the corresponding Dask errors and raising our own messages. -Masked arrays -------------- - -For methods such as `equals`, we need to consider whether an array is -a masked one, and if so, we need to consider the *masks* (e.g. whether they -are equal), as well as the *data* (equality or otherwise). - -But the difficulty is that some level of inspection, i.e. computation, is -required to know whether the object in question is masked or not! (This is -due to, fundamentally, the underlying netCDF or PP representation.) -And we want to avoid early computation, as again it is inefficient. - -Consider, for example, the case of a set of computations in which an -array may acquire a mask, or may not: until the `compute` is run, -we don't know whether there is a mask at the end. Note there is a -distinction here between a standard `array` and a `masked` array -which may have a trivial (say, all `False`) or non-trivial mask, e.g. -for Dask array cases (similarly for `np.ma` etc.): - -**Masked array, non-trivial mask:** - -.. code-block:: python - - >>> dx = da.from_array(np.ma.array([1, 2, 3], mask=[1, 0, 0])) - >>> dx - dask.array - -**Standard array, trivial i.e. all-Falsy mask:** - -.. code-block:: python - - >>> dy = da.from_array(np.ma.array([1, 2, 3], mask=[0, 0, 0])) - >>> dy - dask.array - -**Standard array i.e. no mask:** - -.. code-block:: python - - >>> dz = da.from_array(np.array([1, 2, 3])) - >>> dz - dask.array - - -After discussion, in order to resolve this issue, we proposed -tentatively that *we should ensure all arrays are of the masked variety*, -i.e. `da.ma.masked_array` rather than `da.array`, so in the case of -an array that would otherwise be a standard (unmasked) one, it would -instead be a `da.ma.masked_array` with a fully Falsy mask. - -In practice this would mean that when we instantiate an object -directly from disk, we would edit the `_meta` attribute to -set it to masked. Though we need to evaluate the performance hit -of this to ensure it isn't significant. - - Inheritance from `cfdm` ----------------------- From 69a8d332666195c60d9c7fa48780322d200f9823 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 18 Nov 2021 19:07:43 +0000 Subject: [PATCH 30/55] Add utility function to check if an array is of numeric dtype --- cf/data/utils.py | 50 ++++++++++++++++++++++++++++++++++++++ cf/test/test_Data_utils.py | 17 +++++++++++++ 2 files changed, 67 insertions(+) diff --git a/cf/data/utils.py b/cf/data/utils.py index 429dfe5da6..e48b4d3b6a 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -10,6 +10,56 @@ from ..units import Units +def _is_numeric_dtype(array): + """True if the given array is of a numeric dtype. + + .. versionadded:: 4.0.0 + + :Parameters: + + array: numpy-like array + + :Returns: + + `bool` + Whether or not the array holds numeric elements. + + **Examples:** + + >>> a = np.array([0, 1, 2]) + >>> _is_numeric_dtype(a) + True + >>> a = np.array([False, True, True]) + >>> _is_numeric_dtype(a) + True + >>> a = np.array(["a", "b", "c"], dtype="S1") + >>> _is_numeric_dtype(a) + False + >>> a = np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]) + >>> _is_numeric_dtype(a) + True + >>> a = np.array(10) + >>> _is_numeric_dtype(a) + True + >>> a = np.empty(1, dtype=object) + >>> _is_numeric_dtype(a) + False + + """ + # TODODASK: do we need to make any specific checks relating to ways of + # encoding datetimes, which could be encoded as strings, e.g. as in + # "2000-12-3 12:00", yet could be considered, or encoded as, numeric? + dtype = array.dtype + # This checks if the dtype is either a standard "numeric" type (i.e. + # int types, floating point types or complex floating point types) + # or Boolean, which are effectively a restricted int type (0 or 1). + # We determine the former by seeing if it sits under the 'np.number' + # top-level dtype in the NumPy dtype hierarchy; see the + # 'Hierarchy of type objects' figure diagram under: + # https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars + return np.issubdtype(dtype, np.number) or np.issubdtype(dtype, np.bool_) + + def convert_to_datetime(array, units): """Convert a daskarray to. diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index c811dc1582..8f8fb2a59f 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -83,6 +83,23 @@ def test_Data_Utils__da_ma_allclose(self): ).compute() ) + def test_Data_Utils__is_numeric_dtype(self): + """TODO.""" + _is_numeric_dtype = cf.data.utils._is_numeric_dtype + for a in [ + np.array([0, 1, 2]), + np.array([False, True, True]), + np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), + np.array(10), + ]: + self.assertTrue(_is_numeric_dtype(a)) + + for b in [ + np.array(["a", "b", "c"], dtype="S1"), + np.empty(1, dtype=object), + ]: + self.assertFalse(_is_numeric_dtype(b)) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From d98feca159a7e0cd7f57e2bb1debe5b7a05ece19 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 18 Nov 2021 19:40:35 +0000 Subject: [PATCH 31/55] Data.equals: use appropriate data comparison for non-numeric dtypes --- cf/data/data.py | 34 ++++++++++++++------ cf/test/test_Data.py | 74 ++++++++++++++++++++++---------------------- 2 files changed, 62 insertions(+), 46 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index ed5a26bf19..8fa1cb265f 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -112,6 +112,7 @@ from .partition import Partition from .partitionmatrix import PartitionMatrix from .utils import ( # is_small,; is_very_small, + _is_numeric_dtype, conform_units, convert_to_datetime, convert_to_reftime, @@ -9131,20 +9132,35 @@ def equals( other_dx = other._get_dask() # Now check that corresponding elements are equal within a tolerance. - # We assume that all inputs are masked arrays. + # We assume that all inputs are masked arrays. Note we compare the + # data first as this may return False due to different dtype without + # having to wait until the compute call. + self_is_numeric = _is_numeric_dtype(self_dx) + other_is_numeric = _is_numeric_dtype(other_dx) + if self_is_numeric and other_is_numeric: + data_comparison = _da_ma_allclose( + self_dx, + other_dx, + masked_equal=False, + rtol=float(rtol), + atol=float(atol), + ) + elif not self_is_numeric and not other_is_numeric: + data_comparison = (self_dx == other_dx).all() + else: # one is numeric and other isn't => not equal (incompat. dtype) + logger.info( + f"{self.__class__.__name__}: Different data types:" + f"{self_dx.dtype} != {other_dx.dtype}" + ) + return False + mask_comparison = da.equal( da.ma.getmaskarray(self_dx), da.ma.getmaskarray(other_dx) ) - data_comparison = _da_ma_allclose( - self_dx, - other_dx, - masked_equal=False, - rtol=float(rtol), - atol=float(atol), - ) + # Apply a (dask) logical 'and' to confirm if both the mask and the # data are equal for the pair of masked arrays: - result = da.all(da.logical_and(mask_comparison, data_comparison)) + result = da.all(da.logical_and(data_comparison, mask_comparison)) if not result.compute(): logger.info( diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 3b208029a1..cf5a917f4e 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -285,15 +285,12 @@ def test_Data_equals(self): # Test non-numeric dtype arrays sa1 = cf.Data(np.array(["one", "two", "three"], dtype="S5"), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(sa1.equals(sa1)) + self.assertTrue(sa1.equals(sa1)) sa2_data = np.array(["one", "two", "four"], dtype="S4") sa2 = cf.Data(sa2_data, "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(sa2.equals(sa2)) + self.assertTrue(sa2.equals(sa2)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(sa1.equals(sa2)) - print(catch.output) self.assertTrue( any( "Data: Different data types: |S5 != |S4" in log_msg @@ -302,37 +299,41 @@ def test_Data_equals(self): ) sa3_data = sa2_data.astype("S5") sa3 = cf.Data(sa3_data, "m") - # TODODASK: uncomment below, not working yet - # with self.assertLogs(level=cf.log_level().value) as catch: - # self.assertFalse(sa1.equals(sa3)) - # print(catch.output) - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(sa1.equals(sa3)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) # ...including masked string arrays - sa4 = cf.Data(np.ma.array( - ["one", "two", "three"], mask=[0, 0, 1], dtype="S5", ), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(sa4.equals(sa4)) - sa5 = cf.Data(np.ma.array( - ["one", "two", "three"], mask=[0, 1, 0], dtype="S5", ), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(sa5.equals(sa5)) - - # Test where inputs are scalars - # TODODASK: uncomment below, not working yet - # with self.assertLogs(level=cf.log_level().value) as catch: - # self.assertFalse(sa4.equals(sa5)) - # print(catch.output) - # self.assertTrue( - # any( - # "Data: Different array values" in log_msg - # for log_msg in catch.output - # ) - # ) + sa4 = cf.Data( + np.ma.array( + ["one", "two", "three"], + mask=[0, 0, 1], + dtype="S5", + ), + "m", + ) + self.assertTrue(sa4.equals(sa4)) + sa5 = cf.Data( + np.ma.array( + ["one", "two", "three"], + mask=[0, 1, 0], + dtype="S5", + ), + "m", + ) + self.assertTrue(sa5.equals(sa5)) + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(sa4.equals(sa5)) + self.assertTrue( + any( + "Data: Different array values" in log_msg + for log_msg in catch.output + ) + ) # Test where inputs are scalars s1 = cf.Data(1) @@ -340,8 +341,7 @@ def test_Data_equals(self): s2 = cf.Data(10) self.assertTrue(s2.equals(s2)) s3 = cf.Data("a_string") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(s3.equals(s3)) + self.assertTrue(s3.equals(s3)) # 1. both are scalars with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(s1.equals(s2)) From e14f24683a95d60dd9b81ad2f4cb6b5e36fb2879 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 19 Nov 2021 14:42:50 +0000 Subject: [PATCH 32/55] Debugging of data comparison: add print statements to illustrate --- cf/data/data.py | 14 +++++++++----- cf/test/test_Data.py | 20 ++++++++------------ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 8fa1cb265f..d3c37e29a1 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9138,15 +9138,17 @@ def equals( self_is_numeric = _is_numeric_dtype(self_dx) other_is_numeric = _is_numeric_dtype(other_dx) if self_is_numeric and other_is_numeric: + print("DATA COMP'ING IS", self_dx.compute(), other_dx.compute()) data_comparison = _da_ma_allclose( self_dx, other_dx, - masked_equal=False, + masked_equal=False, # TODODASK: is this correct, or want True? rtol=float(rtol), atol=float(atol), ) + print("DATA COMP RESULT IS", data_comparison.compute()) elif not self_is_numeric and not other_is_numeric: - data_comparison = (self_dx == other_dx).all() + data_comparison = da.all(self_dx == other_dx) else: # one is numeric and other isn't => not equal (incompat. dtype) logger.info( f"{self.__class__.__name__}: Different data types:" @@ -9154,13 +9156,15 @@ def equals( ) return False - mask_comparison = da.equal( - da.ma.getmaskarray(self_dx), da.ma.getmaskarray(other_dx) + mask_comparison = da.all( + da.equal(da.ma.getmaskarray(self_dx), da.ma.getmaskarray(other_dx)) ) + print("MASK COMP RESULT IS", mask_comparison.compute()) # Apply a (dask) logical 'and' to confirm if both the mask and the # data are equal for the pair of masked arrays: - result = da.all(da.logical_and(data_comparison, mask_comparison)) + result = da.logical_and(data_comparison, mask_comparison) + print("OVERALL COMP RESULT IS", result.compute()) if not result.compute(): logger.info( diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index cf5a917f4e..113583ea54 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -222,11 +222,10 @@ def test_Data_equals(self): # Test masked arrays # 1. Example case where the masks differ only (data is identical) j1 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(j1.equals(j1)) + print("SELF-EQUALS J1 NOW <<<<<<<<<<<<<<<<<<<<<<<<<") + self.assertTrue(j1.equals(j1)) j2 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(j2.equals(j2)) + self.assertTrue(j2.equals(j2)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j2)) self.assertTrue( @@ -237,8 +236,7 @@ def test_Data_equals(self): ) # 2. Example case where the data differs only (masks are identical) j3 = cf.Data(np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(j3.equals(j3)) + self.assertTrue(j3.equals(j3)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j3)) self.assertTrue( @@ -250,8 +248,7 @@ def test_Data_equals(self): # 3. Trivial case of data that is fully masked j4 = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(j4.equals(j4)) + self.assertTrue(j4.equals(j4)) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j4.equals(d)) self.assertTrue( @@ -269,12 +266,11 @@ def test_Data_equals(self): # np.ma.allclose and indeed our own _da_ma_allclose methods do hold # these to be 'allclose': Data.equals is stricter than _da_ma_allclose. j5 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(j5.equals(j5)) + self.assertTrue(j5.equals(j5)) j6 = cf.Data(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), "m") - # TODODASK: uncomment below, not working yet - ### self.assertTrue(j6.equals(j6)) + self.assertTrue(j6.equals(j6)) with self.assertLogs(level=cf.log_level().value) as catch: + print("J5 EQUALS J6 NOW <<<<<<<<<<<<<<<<<<<<<<<<<") self.assertFalse(j5.equals(j6)) self.assertTrue( any( From 66e7a053ad36b6143127c8f9fa6adcbf299f52f5 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 24 Nov 2021 17:26:29 +0000 Subject: [PATCH 33/55] Update development notes for cf.Data RE arrays that may be masked --- cf/data/README.rst | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/cf/data/README.rst b/cf/data/README.rst index e57755e68b..557101109d 100644 --- a/cf/data/README.rst +++ b/cf/data/README.rst @@ -23,7 +23,7 @@ distinction here between a standard `array` and a `masked` array which may have a trivial (say, all `False`) or non-trivial mask, e.g. for Dask array cases (similarly for `np.ma` etc.): -**Masked array, non-trivial mask:** +**Masked array with a non-trivial mask:** .. code-block:: python @@ -31,7 +31,7 @@ for Dask array cases (similarly for `np.ma` etc.): >>> dx dask.array -**Standard array, trivial i.e. all-Falsy mask:** +**Masked array with a trivial i.e. all-Falsy mask:** .. code-block:: python @@ -48,16 +48,27 @@ for Dask array cases (similarly for `np.ma` etc.): dask.array -After discussion, in order to resolve this issue, we proposed -tentatively that *we should ensure all arrays are of the masked variety*, -i.e. `da.ma.masked_array` rather than `da.array`, so in the case of -an array that would otherwise be a standard (unmasked) one, it would -instead be a `da.ma.masked_array` with a fully Falsy mask. +Solution +######## -In practice this would mean that when we instantiate an object -directly from disk, we would edit the `_meta` attribute to -set it to masked. Though we need to evaluate the performance hit -of this to ensure it isn't significant. +To work around the complication of not being able to know whether an array +is a masked one or not in any cases of computation where a mask may be +added, we will, for all these cases, use the fact that standard arrays (i.e. +example 3 above) can also be queried with `da.ma.getmaskarray`, returning +an all-False mask (just like a masked array with an all-False mask, i.e. +example 2 above, would): + +.. code-block:: python + + >>> dz = da.from_array(np.array([1, 2, 3])) # i.e. example 3 above + >>> mz = da.ma.getmaskarray(dz) + >>> mz.compute() + array([False, False, False]) + + >>> dy = da.from_array(np.ma.array([1, 2, 3], mask=[0, 0, 0])) # i.e. example 2 + >>> my = da.ma.getmaskarray(dy) + >>> my.compute() + array([False, False, False]) Hardness of the mask From 3e1aa67a881b54faa3ef3b4c3a11ef57cf7e2d75 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 24 Nov 2021 18:02:09 +0000 Subject: [PATCH 34/55] Remove maximum version spec. for cfdm for ease of development --- cf/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cf/__init__.py b/cf/__init__.py index 25f8ca8446..854b5ecd8f 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -195,11 +195,11 @@ # Check the version of cfdm _minimum_vn = "1.8.9.0" -_maximum_vn = "1.8.10.0" +# TODODASK: reinstate maximum version spec., removed for now to simplify dev. _cfdm_version = LooseVersion(cfdm.__version__) -if not LooseVersion(_minimum_vn) <= _cfdm_version < LooseVersion(_maximum_vn): +if not LooseVersion(_minimum_vn) <= _cfdm_version: raise RuntimeError( - f"Bad cfdm version: cf requires {_minimum_vn}<=cfdm<{_maximum_vn}. " + f"Bad cfdm version: cf requires {_minimum_vn}. " f"Got {_cfdm_version} at {cfdm.__file__}" ) From 3053ccb6b81a5174c12c101b44911302ebe838ce Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 24 Nov 2021 18:07:36 +0000 Subject: [PATCH 35/55] CI: add test_Data_utils to Dask testing workflow --- .github/workflows/dask-migration-testing.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dask-migration-testing.yml b/.github/workflows/dask-migration-testing.yml index 45528f82a4..f1a6007d75 100644 --- a/.github/workflows/dask-migration-testing.yml +++ b/.github/workflows/dask-migration-testing.yml @@ -100,13 +100,20 @@ jobs: - name: Notify about starting testing run: echo Setup complete. Now starting to run the cf-python test suite... - # Finally run test_Data.py! + # Finally run the relevant tests: firstly test_Data.py... - name: Run the test_Data test module shell: bash -l {0} run: | cd ${{ github.workspace }}/main/cf/test python test_Data.py + # ... and finally test_Data_utils.py. + - name: Run the test_Data test module + shell: bash -l {0} + run: | + cd ${{ github.workspace }}/main/cf/test + python test_Data_utils.py + # End with a message indicating the suite has completed its run - name: Notify about a completed run run: | From c227e496d6687e055966c77213b9546093794d19 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 10 Jan 2022 14:10:30 +0000 Subject: [PATCH 36/55] Address DH feedback by using cf defaults for rtol & atol values --- cf/data/dask_utils.py | 15 ++++++++++----- cf/test/test_Data_utils.py | 8 ++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 9890c29686..52f1481255 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -8,8 +8,11 @@ import dask.array as da import numpy as np +from ..functions import atol as cf_atol +from ..functions import rtol as cf_rtol -def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): + +def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None): """An effective dask.array.ma.allclose method. True if two dask arrays are element-wise equal within @@ -41,11 +44,9 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): equal (True) or not (False). They are considered equal by default. - rtol: - Relative tolerance. Default is 1e-05. + {{rtol: number, optional}} - atol: - Absolute tolerance. Default is 1e-08. + {{atol: number, optional}} :Returns: @@ -55,6 +56,10 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=1e-05, atol=1e-08): the given rtol and atol tolerance. """ + if rtol is None: + rtol = cf_rtol() + if atol is None: + atol = cf_atol() # Must pass rtol=rtol, atol=atol in as kwargs to allclose, rather than it # using those in local scope from the outer function arguments, because diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index 8f8fb2a59f..788a75a29b 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -26,8 +26,8 @@ def test_Data_Utils__da_ma_allclose(self): b = np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]) c = np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]) d = np.array([1.0, 2.0, 3.0]) - e = a + 5e-04 # outside of default tolerances - f = a + 5e-06 # within default tolerances + e = a + 5e-04 # outside of tolerance to set, namely rtol=1e-05 + f = a + 5e-06 # within set tolerance to be specified, as above # Test the function with these inputs as both numpy and dask arrays... allclose = cf.data.dask_utils._da_ma_allclose @@ -56,8 +56,8 @@ def test_Data_Utils__da_ma_allclose(self): self.assertFalse(allclose(e, a).compute()) self.assertFalse(allclose(da.from_array(e), da_).compute()) - self.assertTrue(allclose(f, a).compute()) - self.assertTrue(allclose(da.from_array(f), da_).compute()) + self.assertTrue(allclose(f, a, rtol=1e-05).compute()) + self.assertTrue(allclose(da.from_array(f), da_, rtol=1e-05).compute()) # Test when array inputs have different chunk sizes da_ = da.from_array(a, chunks=(1, 2)) From f1094d3f0d2b22d5a2fa5802471133535c64cc8f Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 10 Jan 2022 16:11:15 +0000 Subject: [PATCH 37/55] Update self-equality tests in test_Data_equals to use copies --- cf/test/test_Data.py | 49 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 113583ea54..654c978706 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -139,10 +139,10 @@ def test_Data_equals(self): a = np.arange(12).reshape(*shape) d = cf.Data(a, "m") - self.assertTrue(d.equals(d)) # also do self-equality checks! + self.assertTrue(d.equals(d.copy())) # also do self-equality checks! d2 = cf.Data(a.astype(np.float32), "m") # different datatype to d - self.assertTrue(d2.equals(d2)) + self.assertTrue(d2.equals(d2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(d2.equals(d)) self.assertTrue( @@ -153,7 +153,7 @@ def test_Data_equals(self): ) e = cf.Data(a, "s") # different units to d - self.assertTrue(e.equals(e)) + self.assertTrue(e.equals(e.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(e.equals(d)) self.assertTrue( @@ -164,7 +164,7 @@ def test_Data_equals(self): ) f = cf.Data(np.arange(12), "m") # different shape to d - self.assertTrue(f.equals(f)) + self.assertTrue(f.equals(f.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(f.equals(d)) self.assertTrue( @@ -175,7 +175,7 @@ def test_Data_equals(self): ) g = cf.Data(np.ones(shape, dtype="int64"), "m") # different values - self.assertTrue(g.equals(g)) + self.assertTrue(g.equals(g.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(g.equals(d)) self.assertTrue( @@ -190,7 +190,7 @@ def test_Data_equals(self): h = cf.Data(np.full(shape, np.nan), "m") # TODODASK: is this OK given that NaN in NumPy aren't equal to e/o # or should this be assertTrue to expect equality behaviour? - self.assertFalse(h.equals(h)) + self.assertFalse(h.equals(h.copy())) with self.assertLogs(level=cf.log_level().value) as catch: # Compare to d3 not d since np.nan has dtype float64 (IEEE 754) self.assertFalse(h.equals(d3)) @@ -201,7 +201,7 @@ def test_Data_equals(self): ) ) i = cf.Data(np.full(shape, np.inf), "m") - self.assertTrue(i.equals(i)) + self.assertTrue(i.equals(i.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(i.equals(d3)) # np.inf is also of dtype float64 self.assertTrue( @@ -223,9 +223,9 @@ def test_Data_equals(self): # 1. Example case where the masks differ only (data is identical) j1 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") print("SELF-EQUALS J1 NOW <<<<<<<<<<<<<<<<<<<<<<<<<") - self.assertTrue(j1.equals(j1)) + self.assertTrue(j1.equals(j1.copy())) j2 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]), "m") - self.assertTrue(j2.equals(j2)) + self.assertTrue(j2.equals(j2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j2)) self.assertTrue( @@ -236,7 +236,7 @@ def test_Data_equals(self): ) # 2. Example case where the data differs only (masks are identical) j3 = cf.Data(np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]), "m") - self.assertTrue(j3.equals(j3)) + self.assertTrue(j3.equals(j3.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j3)) self.assertTrue( @@ -248,7 +248,7 @@ def test_Data_equals(self): # 3. Trivial case of data that is fully masked j4 = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") - self.assertTrue(j4.equals(j4)) + self.assertTrue(j4.equals(j4.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j4.equals(d)) self.assertTrue( @@ -266,9 +266,9 @@ def test_Data_equals(self): # np.ma.allclose and indeed our own _da_ma_allclose methods do hold # these to be 'allclose': Data.equals is stricter than _da_ma_allclose. j5 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") - self.assertTrue(j5.equals(j5)) + self.assertTrue(j5.equals(j5.copy())) j6 = cf.Data(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), "m") - self.assertTrue(j6.equals(j6)) + self.assertTrue(j6.equals(j6.copy())) with self.assertLogs(level=cf.log_level().value) as catch: print("J5 EQUALS J6 NOW <<<<<<<<<<<<<<<<<<<<<<<<<") self.assertFalse(j5.equals(j6)) @@ -281,10 +281,10 @@ def test_Data_equals(self): # Test non-numeric dtype arrays sa1 = cf.Data(np.array(["one", "two", "three"], dtype="S5"), "m") - self.assertTrue(sa1.equals(sa1)) + self.assertTrue(sa1.equals(sa1.copy())) sa2_data = np.array(["one", "two", "four"], dtype="S4") sa2 = cf.Data(sa2_data, "m") - self.assertTrue(sa2.equals(sa2)) + self.assertTrue(sa2.equals(sa2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(sa1.equals(sa2)) self.assertTrue( @@ -295,6 +295,7 @@ def test_Data_equals(self): ) sa3_data = sa2_data.astype("S5") sa3 = cf.Data(sa3_data, "m") + self.assertTrue(sa3.equals(sa3.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(sa1.equals(sa3)) self.assertTrue( @@ -312,7 +313,7 @@ def test_Data_equals(self): ), "m", ) - self.assertTrue(sa4.equals(sa4)) + self.assertTrue(sa4.equals(sa4.copy())) sa5 = cf.Data( np.ma.array( ["one", "two", "three"], @@ -321,7 +322,7 @@ def test_Data_equals(self): ), "m", ) - self.assertTrue(sa5.equals(sa5)) + self.assertTrue(sa5.equals(sa5.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(sa4.equals(sa5)) self.assertTrue( @@ -333,11 +334,11 @@ def test_Data_equals(self): # Test where inputs are scalars s1 = cf.Data(1) - self.assertTrue(s1.equals(s1)) + self.assertTrue(s1.equals(s1.copy())) s2 = cf.Data(10) - self.assertTrue(s2.equals(s2)) + self.assertTrue(s2.equals(s2.copy())) s3 = cf.Data("a_string") - self.assertTrue(s3.equals(s3)) + self.assertTrue(s3.equals(s3.copy())) # 1. both are scalars with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(s1.equals(s2)) @@ -367,9 +368,9 @@ def test_Data_equals(self): # Test rtol and atol parameters k1 = cf.Data(np.array([10.0, 20.0])) - self.assertTrue(k1.equals(k1)) + self.assertTrue(k1.equals(k1.copy())) k2 = cf.Data(np.array([10.01, 20.01])) - self.assertTrue(k2.equals(k2)) + self.assertTrue(k2.equals(k2.copy())) for ks in [(k1, k2), (k2, k1)]: # to test symmetry/commutativity k_1, k_2 = ks # Only one log check is sufficient here @@ -388,9 +389,9 @@ def test_Data_equals(self): # Test ignore_fill_value parameter m1 = cf.Data(1, fill_value=1000) - self.assertTrue(m1.equals(m1)) + self.assertTrue(m1.equals(m1.copy())) m2 = cf.Data(1, fill_value=2000) - self.assertTrue(m2.equals(m2)) + self.assertTrue(m2.equals(m2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(m1.equals(m2)) self.assertTrue( From 1d2a674e4320f72379eddd92766fa578ea2750fc Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 11 Jan 2022 16:55:10 +0000 Subject: [PATCH 38/55] Address DH feedback: update expected test result for special case --- cf/test/test_Data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 654c978706..c46189025a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -262,16 +262,16 @@ def test_Data_equals(self): # on its own (namely without considering the mask) is not equal to the # other data on its own (e.g. note the 0-th element in below examples). # This differs to case (2): there data differs *only where unmasked*. - # Note these should *not* be considered equal inside cf.Data, whereas - # np.ma.allclose and indeed our own _da_ma_allclose methods do hold - # these to be 'allclose': Data.equals is stricter than _da_ma_allclose. + # Note these *should* be considered equal inside cf.Data, and indeed + # np.ma.allclose and our own _da_ma_allclose methods also hold + # these to be 'allclose'. j5 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") self.assertTrue(j5.equals(j5.copy())) j6 = cf.Data(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), "m") self.assertTrue(j6.equals(j6.copy())) with self.assertLogs(level=cf.log_level().value) as catch: print("J5 EQUALS J6 NOW <<<<<<<<<<<<<<<<<<<<<<<<<") - self.assertFalse(j5.equals(j6)) + self.assertTrue(j5.equals(j6)) self.assertTrue( any( "Data: Different array values" in log_msg From b09a57a7debcbd58604bab55993ef130c74e79a3 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 12 Jan 2022 16:29:23 +0000 Subject: [PATCH 39/55] Indicate tests failing due to inconsistent cfdm Data.equals --- cf/data/data.py | 6 ++++++ cf/test/test_Data.py | 20 +++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index d3c37e29a1..adf78271e5 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9113,6 +9113,12 @@ def equals( ignore_type=ignore_type, _check_values=False, ): + print( + "TODO DASK: using False result from cfdm Data.equals as a " + "short-circuit BUT at present this may actually be a False " + "negative (i.e. result may actually be True) since logic " + "there has not yet been made consistent with cf Data.equals." + ) # TODODASK return False # ------------------------------------------------------------ diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index c46189025a..8d5ad18bac 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -141,16 +141,18 @@ def test_Data_equals(self): d = cf.Data(a, "m") self.assertTrue(d.equals(d.copy())) # also do self-equality checks! - d2 = cf.Data(a.astype(np.float32), "m") # different datatype to d + # Different but equivalent datatype, so expect equality to pass + d2 = cf.Data(a.astype(np.float32), "m") self.assertTrue(d2.equals(d2.copy())) - with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(d2.equals(d)) - self.assertTrue( - any( - "Data: Different data types: float32 != int64" in log_msg - for log_msg in catch.output - ) - ) + # TODODASK: fails due to inconsistency in cfdm Data.equals, to fix + # with self.assertLogs(level=cf.log_level().value) as catch: + # self.assertTrue(d2.equals(d)) + # self.assertTrue( + # any( + # "Data: Different data types: float32 != int64" in log_msg + # for log_msg in catch.output + # ) + # ) e = cf.Data(a, "s") # different units to d self.assertTrue(e.equals(e.copy())) From f71353fbad4dca2417b5ab32ad64f9eec96736a7 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 12 Jan 2022 16:44:55 +0000 Subject: [PATCH 40/55] Set masked_equal=True inside Data.equals to fix self-equality tests --- cf/data/data.py | 2 +- cf/test/test_Data.py | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index adf78271e5..fcbf36604b 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9148,7 +9148,7 @@ def equals( data_comparison = _da_ma_allclose( self_dx, other_dx, - masked_equal=False, # TODODASK: is this correct, or want True? + masked_equal=True, rtol=float(rtol), atol=float(atol), ) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 8d5ad18bac..c6439c8553 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -271,15 +271,7 @@ def test_Data_equals(self): self.assertTrue(j5.equals(j5.copy())) j6 = cf.Data(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), "m") self.assertTrue(j6.equals(j6.copy())) - with self.assertLogs(level=cf.log_level().value) as catch: - print("J5 EQUALS J6 NOW <<<<<<<<<<<<<<<<<<<<<<<<<") - self.assertTrue(j5.equals(j6)) - self.assertTrue( - any( - "Data: Different array values" in log_msg - for log_msg in catch.output - ) - ) + self.assertTrue(j5.equals(j6)) # Test non-numeric dtype arrays sa1 = cf.Data(np.array(["one", "two", "three"], dtype="S5"), "m") From d3d51985e5f8d26ae2d546190bf16607c643f808 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 12 Jan 2022 22:01:26 +0000 Subject: [PATCH 41/55] Tidy PR 254: remove all dev. aid print statements --- cf/data/data.py | 9 +++------ cf/test/test_Data.py | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index ae60a9c6f0..11f25f840f 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9118,8 +9118,9 @@ def equals( _check_values=False, ): print( - "TODO DASK: using False result from cfdm Data.equals as a " - "short-circuit BUT at present this may actually be a False " + "LAMA->DASK WARNING: False result from cfdm Data.equals " + f"short-circuiting cf Data.equals on {self!r} and {other!r} " + "BUT at present this may actually be a False " "negative (i.e. result may actually be True) since logic " "there has not yet been made consistent with cf Data.equals." ) # TODODASK @@ -9148,7 +9149,6 @@ def equals( self_is_numeric = _is_numeric_dtype(self_dx) other_is_numeric = _is_numeric_dtype(other_dx) if self_is_numeric and other_is_numeric: - print("DATA COMP'ING IS", self_dx.compute(), other_dx.compute()) data_comparison = _da_ma_allclose( self_dx, other_dx, @@ -9156,7 +9156,6 @@ def equals( rtol=float(rtol), atol=float(atol), ) - print("DATA COMP RESULT IS", data_comparison.compute()) elif not self_is_numeric and not other_is_numeric: data_comparison = da.all(self_dx == other_dx) else: # one is numeric and other isn't => not equal (incompat. dtype) @@ -9169,12 +9168,10 @@ def equals( mask_comparison = da.all( da.equal(da.ma.getmaskarray(self_dx), da.ma.getmaskarray(other_dx)) ) - print("MASK COMP RESULT IS", mask_comparison.compute()) # Apply a (dask) logical 'and' to confirm if both the mask and the # data are equal for the pair of masked arrays: result = da.logical_and(data_comparison, mask_comparison) - print("OVERALL COMP RESULT IS", result.compute()) if not result.compute(): logger.info( diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 49db9bae50..fdb27bb871 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -144,7 +144,8 @@ def test_Data_equals(self): # Different but equivalent datatype, so expect equality to pass d2 = cf.Data(a.astype(np.float32), "m") self.assertTrue(d2.equals(d2.copy())) - # TODODASK: fails due to inconsistency in cfdm Data.equals, to fix + # TODODASK: fails due to inconsistency in cfdm Data.equals, to be + # fixed (see 'LAMA->DASK WARNING' in Data.equals method). # with self.assertLogs(level=cf.log_level().value) as catch: # self.assertTrue(d2.equals(d)) # self.assertTrue( @@ -224,7 +225,6 @@ def test_Data_equals(self): # Test masked arrays # 1. Example case where the masks differ only (data is identical) j1 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") - print("SELF-EQUALS J1 NOW <<<<<<<<<<<<<<<<<<<<<<<<<") self.assertTrue(j1.equals(j1.copy())) j2 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]), "m") self.assertTrue(j2.equals(j2.copy())) From 2f14b3e5cd117c17ea1c3e7f39390cec204e6d8d Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 13 Jan 2022 21:46:53 +0000 Subject: [PATCH 42/55] Update Data.equals to check for strict datatype equality --- cf/test/test_Data.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index fdb27bb871..523b2785ba 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -141,19 +141,19 @@ def test_Data_equals(self): d = cf.Data(a, "m") self.assertTrue(d.equals(d.copy())) # also do self-equality checks! - # Different but equivalent datatype, so expect equality to pass + # Different but equivalent datatype, which should *fail* the equality + # test (i.e. equals return False) because we want equals to check + # for strict equality, including equality of data type. d2 = cf.Data(a.astype(np.float32), "m") self.assertTrue(d2.equals(d2.copy())) - # TODODASK: fails due to inconsistency in cfdm Data.equals, to be - # fixed (see 'LAMA->DASK WARNING' in Data.equals method). - # with self.assertLogs(level=cf.log_level().value) as catch: - # self.assertTrue(d2.equals(d)) - # self.assertTrue( - # any( - # "Data: Different data types: float32 != int64" in log_msg - # for log_msg in catch.output - # ) - # ) + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(d2.equals(d)) + self.assertTrue( + any( + "Data: Different data types: float32 != int64" in log_msg + for log_msg in catch.output + ) + ) e = cf.Data(a, "s") # different units to d self.assertTrue(e.equals(e.copy())) From e2960617d24edda764e75557d255632c7e339c2f Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Mon, 17 Jan 2022 19:12:13 +0000 Subject: [PATCH 43/55] Comments in preparation for eventual equal_nan kwarg to Data.equals --- cf/data/data.py | 9 ++------- cf/test/test_Data.py | 7 ++++--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 11f25f840f..140de50fc0 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9117,13 +9117,8 @@ def equals( ignore_type=ignore_type, _check_values=False, ): - print( - "LAMA->DASK WARNING: False result from cfdm Data.equals " - f"short-circuiting cf Data.equals on {self!r} and {other!r} " - "BUT at present this may actually be a False " - "negative (i.e. result may actually be True) since logic " - "there has not yet been made consistent with cf Data.equals." - ) # TODODASK + # TODODASK: consistency with cfdm Data.equals needs to be verified + # possibly via a follow-up PR to cfdm to implement any changes. return False # ------------------------------------------------------------ diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 523b2785ba..b6e7d3458d 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -188,11 +188,10 @@ def test_Data_equals(self): ) ) - # Test NaN and inf values + # Test NaN values d3 = cf.Data(a.astype(np.float64), "m") h = cf.Data(np.full(shape, np.nan), "m") - # TODODASK: is this OK given that NaN in NumPy aren't equal to e/o - # or should this be assertTrue to expect equality behaviour? + # TODODASK: implement and test equal_nan kwarg to configure NaN eq. self.assertFalse(h.equals(h.copy())) with self.assertLogs(level=cf.log_level().value) as catch: # Compare to d3 not d since np.nan has dtype float64 (IEEE 754) @@ -203,6 +202,8 @@ def test_Data_equals(self): for log_msg in catch.output ) ) + + # Test inf values i = cf.Data(np.full(shape, np.inf), "m") self.assertTrue(i.equals(i.copy())) with self.assertLogs(level=cf.log_level().value) as catch: From 5d6838aa48e55c8721b456ee181516bb5e6ee32e Mon Sep 17 00:00:00 2001 From: "Sadie L. Bartholomew" Date: Tue, 18 Jan 2022 13:38:44 +0000 Subject: [PATCH 44/55] Update cf/data/utils.py: more accurate docstring Co-authored-by: David Hassell --- cf/data/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/utils.py b/cf/data/utils.py index e48b4d3b6a..3d3c935db9 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -11,7 +11,7 @@ def _is_numeric_dtype(array): - """True if the given array is of a numeric dtype. + """True if the given array is of a numeric or boolean data type. .. versionadded:: 4.0.0 From 201a91f2bffc8ad3dbfc849294db9278fdebeb81 Mon Sep 17 00:00:00 2001 From: "Sadie L. Bartholomew" Date: Tue, 18 Jan 2022 14:26:42 +0000 Subject: [PATCH 45/55] Update cf/data/dask_utils.py: improve docstring formatting Co-authored-by: David Hassell --- cf/data/dask_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 81b6850b07..42c984de4b 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -53,7 +53,7 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None): Boolean A Boolean value indicating whether or not the two dask arrays are element-wise equal to - the given rtol and atol tolerance. + the given *rtol* and *atol* tolerance. """ if rtol is None: From 981ad20c566156c5fff955efad189347b5950262 Mon Sep 17 00:00:00 2001 From: "Sadie L. Bartholomew" Date: Tue, 18 Jan 2022 14:33:58 +0000 Subject: [PATCH 46/55] Update cf/test/test_Data.py: test equality to self (not a copy) Co-authored-by: David Hassell --- cf/test/test_Data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index b6e7d3458d..382bad1b4a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -139,6 +139,7 @@ def test_Data_equals(self): a = np.arange(12).reshape(*shape) d = cf.Data(a, "m") + self.assertTrue(d.equals(d)) # check equal to self self.assertTrue(d.equals(d.copy())) # also do self-equality checks! # Different but equivalent datatype, which should *fail* the equality From bc9a434b513c1f2955f1c1f8d31c6b60965cc659 Mon Sep 17 00:00:00 2001 From: "Sadie L. Bartholomew" Date: Tue, 18 Jan 2022 16:58:22 +0000 Subject: [PATCH 47/55] Update cf/data/dask_utils.py: document kwarg type Co-authored-by: David Hassell --- cf/data/dask_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 42c984de4b..dc51ea9e67 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -39,7 +39,7 @@ def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None): y: a dask array to compare with x - masked_equal: + masked_equal: `bool`, optional Whether masked values in a and b are considered equal (True) or not (False). They are considered equal by default. From 42e62634124650a61b6d6062f109714c8ffbad46 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 18 Jan 2022 20:32:19 +0000 Subject: [PATCH 48/55] Address DH feedback by removing loop over pairs in test_Data_equals --- cf/test/test_Data.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index c1c2fe826e..6b925676a3 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -367,21 +367,19 @@ def test_Data_equals(self): self.assertTrue(k1.equals(k1.copy())) k2 = cf.Data(np.array([10.01, 20.01])) self.assertTrue(k2.equals(k2.copy())) - for ks in [(k1, k2), (k2, k1)]: # to test symmetry/commutativity - k_1, k_2 = ks - # Only one log check is sufficient here - with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(k_1.equals(k_2, atol=0.005, rtol=0)) - self.assertTrue( - any( - "Data: Different array values (atol=0.005, rtol=0)" - in log_msg - for log_msg in catch.output - ) + # Only one log check is sufficient here + with self.assertLogs(level=cf.log_level().value) as catch: + self.assertFalse(k1.equals(k2, atol=0.005, rtol=0)) + self.assertTrue( + any( + "Data: Different array values (atol=0.005, rtol=0)" + in log_msg + for log_msg in catch.output ) - self.assertTrue(k_1.equals(k_2, atol=0.02, rtol=0)) - self.assertFalse(k_1.equals(k_2, atol=0, rtol=0.0005)) - self.assertTrue(k_1.equals(k_2, atol=0, rtol=0.002)) + ) + self.assertTrue(k1.equals(k2, atol=0.02, rtol=0)) + self.assertFalse(k1.equals(k2, atol=0, rtol=0.0005)) + self.assertTrue(k1.equals(k2, atol=0, rtol=0.002)) # Test ignore_fill_value parameter m1 = cf.Data(1, fill_value=1000) From 3846d07e8964f54168016f82870bf40f655a6ccf Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 18 Jan 2022 21:25:43 +0000 Subject: [PATCH 49/55] Address DH feedback: do not check numpy arrays in test_Data_utils --- cf/test/test_Data_utils.py | 45 +++++++++++++------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index 788a75a29b..859e6241ee 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -21,67 +21,52 @@ def test_Data_Utils__da_ma_allclose(self): # and insist on the mask being identical as well as the data # (separately, i.e. unmasked) all being 'allclose', so inside our # cf.Data objects 'a' and 'a2' would instead *not* be considered equal. - a = np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]) - a2 = np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]) - b = np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]) - c = np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]) - d = np.array([1.0, 2.0, 3.0]) + a_np = np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]) + a = da.from_array(a_np) + a2 = da.from_array(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0])) + b_np = np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]) + b = da.from_array(b_np) + c_np = np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]) + c = da.from_array(c_np) + d = da.from_array(np.array([1.0, 2.0, 3.0])) e = a + 5e-04 # outside of tolerance to set, namely rtol=1e-05 f = a + 5e-06 # within set tolerance to be specified, as above # Test the function with these inputs as both numpy and dask arrays... allclose = cf.data.dask_utils._da_ma_allclose - da_ = da.from_array(a) self.assertTrue(allclose(a, a).compute()) - self.assertTrue(allclose(da_, da_).compute()) - self.assertTrue(allclose(a2, a).compute()) - self.assertTrue(allclose(da.from_array(a2), da_).compute()) - self.assertTrue(allclose(b, a).compute()) - self.assertTrue(allclose(da.from_array(b), da_).compute()) + # ...including testing the 'masked_equal' parameter self.assertFalse(allclose(b, a, masked_equal=False).compute()) - self.assertFalse( - allclose(da.from_array(b), da_, masked_equal=False).compute() - ) self.assertFalse(allclose(c, a).compute()) - self.assertFalse(allclose(da.from_array(c), da_).compute()) - self.assertTrue(allclose(d, a).compute()) - self.assertTrue(allclose(da.from_array(d), da_).compute()) - self.assertFalse(allclose(e, a).compute()) - self.assertFalse(allclose(da.from_array(e), da_).compute()) self.assertTrue(allclose(f, a, rtol=1e-05).compute()) - self.assertTrue(allclose(da.from_array(f), da_, rtol=1e-05).compute()) # Test when array inputs have different chunk sizes - da_ = da.from_array(a, chunks=(1, 2)) - self.assertTrue(allclose(da.from_array(b, chunks=(3,)), da_).compute()) + a_chunked = da.from_array(a_np, chunks=(1, 2)) + self.assertTrue( + allclose(da.from_array(b_np, chunks=(3,)), a_chunked).compute() + ) self.assertFalse( allclose( - da.from_array(b, chunks=(3,)), da_, masked_equal=False + da.from_array(b_np, chunks=(3,)), a_chunked, masked_equal=False ).compute() ) self.assertFalse( - allclose(da.from_array(c, chunks=(3,)), da_).compute() + allclose(da.from_array(c_np, chunks=(3,)), a_chunked).compute() ) # Test the 'rtol' and 'atol' parameters: self.assertFalse(allclose(e, a, rtol=1e-06).compute()) - self.assertFalse(allclose(da.from_array(e), da_, rtol=1e-06).compute()) b1 = e / 10000 b2 = a / 10000 self.assertTrue(allclose(b1, b2, atol=1e-05).compute()) - self.assertTrue( - allclose( - da.from_array(b1), da.from_array(b2), atol=1e-05 - ).compute() - ) def test_Data_Utils__is_numeric_dtype(self): """TODO.""" From 35b5adb750926af9fc8b1d733e143cdbd672835d Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Tue, 18 Jan 2022 22:09:26 +0000 Subject: [PATCH 50/55] Address DH feedback: adjust test_Daya_equals RE string dtypes --- cf/test/test_Data.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 6b925676a3..85cf3ad143 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -281,14 +281,10 @@ def test_Data_equals(self): sa2_data = np.array(["one", "two", "four"], dtype="S4") sa2 = cf.Data(sa2_data, "m") self.assertTrue(sa2.equals(sa2.copy())) - with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(sa1.equals(sa2)) - self.assertTrue( - any( - "Data: Different data types: |S5 != |S4" in log_msg - for log_msg in catch.output - ) - ) + # Unlike for numeric types, for string-like data as long as the data + # is the same consider the arrays equal, even if the dtype differs. + # TODO DASK: this behaviour will be added via cfdm, test fails for now + # ## self.assertTrue(sa1.equals(sa2)) sa3_data = sa2_data.astype("S5") sa3 = cf.Data(sa3_data, "m") self.assertTrue(sa3.equals(sa3.copy())) From 7cd2e891c9cb006a375a6bbd03988ec91d7e6e98 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Wed, 19 Jan 2022 21:36:02 +0000 Subject: [PATCH 51/55] Address DH feedback: use multiple dask chunks in test_Data_equals --- cf/test/test_Data.py | 80 +++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 85cf3ad143..8f5cdf0175 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -136,16 +136,17 @@ def test_Data_equals(self): return shape = 3, 4 + chunksize = 2, 6 a = np.arange(12).reshape(*shape) - d = cf.Data(a, "m") + d = cf.Data(a, "m", chunks=chunksize) self.assertTrue(d.equals(d)) # check equal to self self.assertTrue(d.equals(d.copy())) # also do self-equality checks! # Different but equivalent datatype, which should *fail* the equality # test (i.e. equals return False) because we want equals to check # for strict equality, including equality of data type. - d2 = cf.Data(a.astype(np.float32), "m") + d2 = cf.Data(a.astype(np.float32), "m", chunks=chunksize) self.assertTrue(d2.equals(d2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(d2.equals(d)) @@ -156,7 +157,7 @@ def test_Data_equals(self): ) ) - e = cf.Data(a, "s") # different units to d + e = cf.Data(a, "s", chunks=chunksize) # different units to d self.assertTrue(e.equals(e.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(e.equals(d)) @@ -167,7 +168,7 @@ def test_Data_equals(self): ) ) - f = cf.Data(np.arange(12), "m") # different shape to d + f = cf.Data(np.arange(12), "m", chunks=(6,)) # different shape to d self.assertTrue(f.equals(f.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(f.equals(d)) @@ -178,7 +179,9 @@ def test_Data_equals(self): ) ) - g = cf.Data(np.ones(shape, dtype="int64"), "m") # different values + g = cf.Data( + np.ones(shape, dtype="int64"), "m", chunks=chunksize + ) # different values self.assertTrue(g.equals(g.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(g.equals(d)) @@ -190,8 +193,8 @@ def test_Data_equals(self): ) # Test NaN values - d3 = cf.Data(a.astype(np.float64), "m") - h = cf.Data(np.full(shape, np.nan), "m") + d3 = cf.Data(a.astype(np.float64), "m", chunks=chunksize) + h = cf.Data(np.full(shape, np.nan), "m", chunks=chunksize) # TODODASK: implement and test equal_nan kwarg to configure NaN eq. self.assertFalse(h.equals(h.copy())) with self.assertLogs(level=cf.log_level().value) as catch: @@ -205,7 +208,7 @@ def test_Data_equals(self): ) # Test inf values - i = cf.Data(np.full(shape, np.inf), "m") + i = cf.Data(np.full(shape, np.inf), "m", chunks=chunksize) self.assertTrue(i.equals(i.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(i.equals(d3)) # np.inf is also of dtype float64 @@ -226,9 +229,18 @@ def test_Data_equals(self): # Test masked arrays # 1. Example case where the masks differ only (data is identical) - j1 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + mask_test_chunksize = (2, 1) + j1 = cf.Data( + np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), + "m", + chunks=mask_test_chunksize, + ) self.assertTrue(j1.equals(j1.copy())) - j2 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]), "m") + j2 = cf.Data( + np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]), + "m", + chunks=mask_test_chunksize, + ) self.assertTrue(j2.equals(j2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j2)) @@ -239,7 +251,11 @@ def test_Data_equals(self): ) ) # 2. Example case where the data differs only (masks are identical) - j3 = cf.Data(np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]), "m") + j3 = cf.Data( + np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]), + "m", + chunks=mask_test_chunksize, + ) self.assertTrue(j3.equals(j3.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j1.equals(j3)) @@ -251,7 +267,9 @@ def test_Data_equals(self): ) # 3. Trivial case of data that is fully masked - j4 = cf.Data(np.ma.masked_all(shape, dtype="int"), "m") + j4 = cf.Data( + np.ma.masked_all(shape, dtype="int"), "m", chunks=chunksize + ) self.assertTrue(j4.equals(j4.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(j4.equals(d)) @@ -269,24 +287,34 @@ def test_Data_equals(self): # Note these *should* be considered equal inside cf.Data, and indeed # np.ma.allclose and our own _da_ma_allclose methods also hold # these to be 'allclose'. - j5 = cf.Data(np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + j5 = cf.Data( + np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]), + "m", + chunks=mask_test_chunksize, + ) self.assertTrue(j5.equals(j5.copy())) - j6 = cf.Data(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), "m") + j6 = cf.Data( + np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), + "m", + chunks=mask_test_chunksize, + ) self.assertTrue(j6.equals(j6.copy())) self.assertTrue(j5.equals(j6)) # Test non-numeric dtype arrays - sa1 = cf.Data(np.array(["one", "two", "three"], dtype="S5"), "m") + sa1 = cf.Data( + np.array(["one", "two", "three"], dtype="S5"), "m", chunks=(3,) + ) self.assertTrue(sa1.equals(sa1.copy())) sa2_data = np.array(["one", "two", "four"], dtype="S4") - sa2 = cf.Data(sa2_data, "m") + sa2 = cf.Data(sa2_data, "m", chunks=(3,)) self.assertTrue(sa2.equals(sa2.copy())) # Unlike for numeric types, for string-like data as long as the data # is the same consider the arrays equal, even if the dtype differs. # TODO DASK: this behaviour will be added via cfdm, test fails for now # ## self.assertTrue(sa1.equals(sa2)) sa3_data = sa2_data.astype("S5") - sa3 = cf.Data(sa3_data, "m") + sa3 = cf.Data(sa3_data, "m", chunks=mask_test_chunksize) self.assertTrue(sa3.equals(sa3.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(sa1.equals(sa3)) @@ -304,6 +332,7 @@ def test_Data_equals(self): dtype="S5", ), "m", + chunks=mask_test_chunksize, ) self.assertTrue(sa4.equals(sa4.copy())) sa5 = cf.Data( @@ -313,6 +342,7 @@ def test_Data_equals(self): dtype="S5", ), "m", + chunks=mask_test_chunksize, ) self.assertTrue(sa5.equals(sa5.copy())) with self.assertLogs(level=cf.log_level().value) as catch: @@ -325,11 +355,12 @@ def test_Data_equals(self): ) # Test where inputs are scalars - s1 = cf.Data(1) + scalar_test_chunksize = (10,) + s1 = cf.Data(1, chunks=scalar_test_chunksize) self.assertTrue(s1.equals(s1.copy())) - s2 = cf.Data(10) + s2 = cf.Data(10, chunks=scalar_test_chunksize) self.assertTrue(s2.equals(s2.copy())) - s3 = cf.Data("a_string") + s3 = cf.Data("a_string", chunks=scalar_test_chunksize) self.assertTrue(s3.equals(s3.copy())) # 1. both are scalars with self.assertLogs(level=cf.log_level().value) as catch: @@ -359,9 +390,10 @@ def test_Data_equals(self): ) # Test rtol and atol parameters - k1 = cf.Data(np.array([10.0, 20.0])) + tol_check_chunksize = 1, 1 + k1 = cf.Data(np.array([10.0, 20.0]), chunks=tol_check_chunksize) self.assertTrue(k1.equals(k1.copy())) - k2 = cf.Data(np.array([10.01, 20.01])) + k2 = cf.Data(np.array([10.01, 20.01]), chunks=tol_check_chunksize) self.assertTrue(k2.equals(k2.copy())) # Only one log check is sufficient here with self.assertLogs(level=cf.log_level().value) as catch: @@ -378,9 +410,9 @@ def test_Data_equals(self): self.assertTrue(k1.equals(k2, atol=0, rtol=0.002)) # Test ignore_fill_value parameter - m1 = cf.Data(1, fill_value=1000) + m1 = cf.Data(1, fill_value=1000, chunks=scalar_test_chunksize) self.assertTrue(m1.equals(m1.copy())) - m2 = cf.Data(1, fill_value=2000) + m2 = cf.Data(1, fill_value=2000, chunks=scalar_test_chunksize) self.assertTrue(m2.equals(m2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: self.assertFalse(m1.equals(m2)) From aedec2f6b823ed6e03616c4e49d62def1214bf65 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 20 Jan 2022 00:36:59 +0000 Subject: [PATCH 52/55] Update test_Data verbosity management and testing --- cf/data/data.py | 53 +++++++++++++++++++++++--------------------- cf/test/test_Data.py | 47 ++++++++++++++++++++++++--------------- 2 files changed, 57 insertions(+), 43 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 08eeb56daa..8140aa95fa 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -125,6 +125,9 @@ # from dask.array import Array +_DASKIFIED_VERBOSE = None # see below for valid levels, adapt as useful + + logger = logging.getLogger(__name__) @@ -643,7 +646,7 @@ def dask_compressed_array(self): return ca._get_dask().copy() - @daskified() + @daskified(_DASKIFIED_VERBOSE) def __contains__(self, value): """Membership test operator ``in`` @@ -935,7 +938,7 @@ def __repr__(self): """ return super().__repr__().replace("<", ", )" in log_msg @@ -171,7 +175,7 @@ def test_Data_equals(self): f = cf.Data(np.arange(12), "m", chunks=(6,)) # different shape to d self.assertTrue(f.equals(f.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(f.equals(d)) + self.assertFalse(f.equals(d, verbose=2)) self.assertTrue( any( "Data: Different shapes: (12,) != (3, 4)" in log_msg @@ -184,7 +188,7 @@ def test_Data_equals(self): ) # different values self.assertTrue(g.equals(g.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(g.equals(d)) + self.assertFalse(g.equals(d, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -199,7 +203,7 @@ def test_Data_equals(self): self.assertFalse(h.equals(h.copy())) with self.assertLogs(level=cf.log_level().value) as catch: # Compare to d3 not d since np.nan has dtype float64 (IEEE 754) - self.assertFalse(h.equals(d3)) + self.assertFalse(h.equals(d3, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -211,7 +215,8 @@ def test_Data_equals(self): i = cf.Data(np.full(shape, np.inf), "m", chunks=chunksize) self.assertTrue(i.equals(i.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(i.equals(d3)) # np.inf is also of dtype float64 + # np.inf is also of dtype float64 (see comment on NaN tests above) + self.assertFalse(i.equals(d3, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -219,7 +224,7 @@ def test_Data_equals(self): ) ) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(h.equals(i)) + self.assertFalse(h.equals(i, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -243,7 +248,7 @@ def test_Data_equals(self): ) self.assertTrue(j2.equals(j2.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(j1.equals(j2)) + self.assertFalse(j1.equals(j2, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -258,7 +263,7 @@ def test_Data_equals(self): ) self.assertTrue(j3.equals(j3.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(j1.equals(j3)) + self.assertFalse(j1.equals(j3, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -272,7 +277,7 @@ def test_Data_equals(self): ) self.assertTrue(j4.equals(j4.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(j4.equals(d)) + self.assertFalse(j4.equals(d, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -317,7 +322,7 @@ def test_Data_equals(self): sa3 = cf.Data(sa3_data, "m", chunks=mask_test_chunksize) self.assertTrue(sa3.equals(sa3.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(sa1.equals(sa3)) + self.assertFalse(sa1.equals(sa3, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -346,7 +351,7 @@ def test_Data_equals(self): ) self.assertTrue(sa5.equals(sa5.copy())) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(sa4.equals(sa5)) + self.assertFalse(sa4.equals(sa5, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -364,7 +369,7 @@ def test_Data_equals(self): self.assertTrue(s3.equals(s3.copy())) # 1. both are scalars with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(s1.equals(s2)) + self.assertFalse(s1.equals(s2, verbose=2)) self.assertTrue( any( "Data: Different array values" in log_msg @@ -372,7 +377,7 @@ def test_Data_equals(self): ) ) with self.assertLogs(level=cf.log_level().value) as catch: - self.assertFalse(s1.equals(s3)) + self.assertFalse(s1.equals(s3, verbose=2)) self.assertTrue( any( "Data: Different data types: int64 != 3.10 this can be replaced by 'assertNoLogs' method. + logger.warn("Log warning to prevent test error on empty log.") + self.assertFalse(d2.equals(d, verbose=verbosity_level)) self.assertIs( any( From 8467f6ee337bf6ab6593e61b1786c9d71dcda47b Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 20 Jan 2022 01:08:20 +0000 Subject: [PATCH 53/55] Address isort & black linting failures from master merge & PR --- cf/data/creation.py | 12 ++++-------- cf/formula_terms.py | 6 ++++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/cf/data/creation.py b/cf/data/creation.py index ba8a86b7e4..86ebd680b2 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -2,26 +2,22 @@ from functools import lru_cache from uuid import uuid4 -import numpy as np - import dask.array as da +import numpy as np from dask.array.core import getter, normalize_chunks, slices_from_chunks -from dask.utils import SerializableLock from dask.base import tokenize from dask.config import config +from dask.utils import SerializableLock from ..units import Units - -from .utils import chunk_shapes, chunk_positions - from . import ( FilledArray, GatheredSubarray, RaggedContiguousSubarray, - RaggedIndexedSubarray, RaggedIndexedContiguousSubarray, + RaggedIndexedSubarray, ) - +from .utils import chunk_positions, chunk_shapes # Cache of axis identities _cached_axes = {} diff --git a/cf/formula_terms.py b/cf/formula_terms.py index 7f50cc80f8..8ecb993ecf 100644 --- a/cf/formula_terms.py +++ b/cf/formula_terms.py @@ -2116,8 +2116,10 @@ def formula( then a `None` is returned for all of the tuple elements. """ - standard_name = coordinate_reference.coordinate_conversion.get_parameter( - "standard_name", None + standard_name = ( + coordinate_reference.coordinate_conversion.get_parameter( + "standard_name", None + ) ) if standard_name is not None: From f2728426e9f0c7cda4905176f33bde974ab6c9f9 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Thu, 20 Jan 2022 01:14:21 +0000 Subject: [PATCH 54/55] Address all flake linting failures except is*_small --- cf/data/abstract/compressedsubarray.py | 2 ++ cf/data/creation.py | 1 - cf/data/data.py | 2 +- cf/data/netcdfarray.py | 1 - cf/functions.py | 1 - cf/mixin/propertiesdata.py | 2 +- cf/read_write/read.py | 2 +- cf/read_write/um/umread.py | 6 +++--- 8 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cf/data/abstract/compressedsubarray.py b/cf/data/abstract/compressedsubarray.py index 601abe7e93..6e49cdacc5 100644 --- a/cf/data/abstract/compressedsubarray.py +++ b/cf/data/abstract/compressedsubarray.py @@ -2,6 +2,8 @@ from functools import reduce from operator import mul +from ...functions import inspect as cf_inspect + class CompressedSubarray(abc.ABC): """Abstract base class for a compressed sub-array container.""" diff --git a/cf/data/creation.py b/cf/data/creation.py index 86ebd680b2..9dc8f93592 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -9,7 +9,6 @@ from dask.config import config from dask.utils import SerializableLock -from ..units import Units from . import ( FilledArray, GatheredSubarray, diff --git a/cf/data/data.py b/cf/data/data.py index 8140aa95fa..f45462fd0c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -3841,7 +3841,7 @@ def _binary_operation(self, other, method): elif other is None: # Can't sensibly initialize a Data object from a bare # `None` (issue #281) - other = numpy_array(None, dtype=object) + other = np.array(None, dtype=object) other = type(self).asdata(other) diff --git a/cf/data/netcdfarray.py b/cf/data/netcdfarray.py index 54ce334441..b13b498b19 100644 --- a/cf/data/netcdfarray.py +++ b/cf/data/netcdfarray.py @@ -1,7 +1,6 @@ import cfdm from . import abstract -from .functions import _close_netcdf_file, _open_netcdf_file class NetCDFArray(cfdm.NetCDFArray, abstract.FileArray): diff --git a/cf/functions.py b/cf/functions.py index 7c115c4638..9b0a0973c4 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -33,7 +33,6 @@ from numpy import ascontiguousarray as _numpy_ascontiguousarray from numpy import isclose as _x_numpy_isclose from numpy import shape as _numpy_shape -from numpy import size as _numpy_size from numpy import take as _numpy_take from numpy import tile as _numpy_tile from numpy.ma import all as _numpy_ma_all diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index fff1365703..27c97fdea2 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -2443,7 +2443,7 @@ def varray(self): """ - raise DeprecatedError("TODODASK") + raise ValueError("TODODASK - deprecated?") # data = self.get_data(None) # if data is None: diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 4e529f233b..3ce30a2a6b 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -1055,7 +1055,7 @@ def _read_a_file( fmt=fmt, word_size=word_size, endian=endian, - chunk=chunk, + chunk=chunks, select=select, ) diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 636dc158af..a6af7bb7bd 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -1870,7 +1870,7 @@ def create_data(self): dsk[name + (0,)] = (getter, subarray, full_slice, asarray, lock) - dtype = numpy_result_type(*file_data_types) + dtype = np.result_type(*file_data_types) chunks = normalize_chunks((-1, -1), shape=data_shape, dtype=dtype) else: # -------------------------------------------------------- @@ -1921,7 +1921,7 @@ def create_data(self): lock, ) - dtype = numpy_result_type(*file_data_types) + dtype = np.result_type(*file_data_types) chunks = normalize_chunks( (1, -1, -1), shape=data_shape, dtype=dtype ) @@ -1964,7 +1964,7 @@ def create_data(self): lock, ) - dtype = numpy_result_type(*file_data_types) + dtype = np.result_type(*file_data_types) chunks = normalize_chunks( (1, 1, -1, -1), shape=data_shape, dtype=dtype ) From 07d937d4b49f2f0548c9da47ee705721c9a13d01 Mon Sep 17 00:00:00 2001 From: Sadie Louise Bartholomew Date: Fri, 21 Jan 2022 16:38:40 +0000 Subject: [PATCH 55/55] Address remaining flake linting failures: is*_small --- cf/data/creation.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/cf/data/creation.py b/cf/data/creation.py index 9dc8f93592..9adfbe3846 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -134,8 +134,9 @@ def compressed_to_dask(array): count = array.get_count().dask_array(copy=False) - if is_small(count): - count = count.compute() + # TODODASK: remove with #297 merge + # if is_small(count): + # count = count.compute() # Find the chunk sizes and positions of the uncompressed # array. Each chunk will contain the data for one instance, @@ -193,8 +194,9 @@ def compressed_to_dask(array): _, inverse = da.unique(index, return_inverse=True) - if is_very_small(index): - inverse = inverse.compute() + # TODODASK: remove with #297 merge + # if is_very_small(index): + # inverse = inverse.compute() chunks = normalize_chunks( (1,) + (-1,) * (uncompressed_ndim - 1), @@ -231,14 +233,16 @@ def compressed_to_dask(array): index = array.get_index().dask_array(copy=False) count = array.get_count().dask_array(copy=False) - if is_small(index): - index = index.compute() - index_is_dask = False - else: - index_is_dask = True + # TODODASK: remove with #297 merge + # if is_small(index): + # index = index.compute() + # index_is_dask = False + # else: + index_is_dask = True - if is_small(count): - count = count.compute() + # TODODASK: remove with #297 merge + # if is_small(count): + # count = count.compute() cumlative_count = count.cumsum(axis=0) @@ -263,8 +267,9 @@ def compressed_to_dask(array): xprofile_indices = np.where(index == i)[0] if index_is_dask: xprofile_indices.compute_chunk_sizes() - if is_small(xprofile_indices): - xprofile_indices = xprofile_indices.compute() + # TODODASK: remove with #297 merge + # if is_small(xprofile_indices): + # xprofile_indices = xprofile_indices.compute() # --- End: if # Find the number of actual profiles in this instance