add proto2.message for accept type of python custom component #6693

Open · wants to merge 1 commit into base: master
RELEASE.md (2 changes: 0 additions & 2 deletions)

@@ -13,8 +13,6 @@
 * `ph.make_proto()` allows constructing proto-valued placeholders, e.g. for
   larger config protos fed to a component.
 * `ph.join_path()` is like `os.path.join()` but for placeholders.
-* Support passing in `experimental_debug_stripper` into the Transform
-  pipeline runner.

 ## Breaking Changes
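For context on the two release-note bullets retained above, here is a minimal sketch of how the placeholder helpers might be combined. The call signatures and field choices are assumptions inferred from the wording of the notes (`ph.make_proto()` taking a base message plus field overrides, `ph.join_path()` mirroring `os.path.join()`), not verified against this branch:

```python
from tfx.dsl.placeholder import placeholder as ph
from tfx.proto import infra_validator_pb2

# Assumed usage: a base proto plus keyword overrides, producing a
# proto-valued placeholder that is resolved at pipeline run time.
serving_spec = ph.make_proto(
    infra_validator_pb2.ServingSpec(),
    model_name=ph.exec_property('model_name'),
)

# Assumed to behave like os.path.join(), but over placeholder values.
stats_path = ph.join_path(ph.output('statistics').uri, 'Split-train')
```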
tfx/components/infra_validator/model_server_runners/kubernetes_runner.py

@@ -93,17 +93,10 @@ def _convert_to_kube_env(
 def _convert_to_resource_requirements(
     resources: infra_validator_pb2.Resources
 ) -> k8s_client.V1ResourceRequirements:
-  if hasattr(k8s_client.V1ResourceRequirements, 'claims'):
-    return k8s_client.V1ResourceRequirements(
-        requests=dict(resources.requests),
-        limits=dict(resources.limits),
-        claims=dict(resources.claims),
-    )
-  else:
-    return k8s_client.V1ResourceRequirements(
-        requests=dict(resources.requests),
-        limits=dict(resources.limits),
-    )
+  return k8s_client.V1ResourceRequirements(
+      requests=dict(resources.requests),
+      limits=dict(resources.limits),
+  )


 class KubernetesRunner(base_runner.BaseModelServerRunner):
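The branch deleted above existed to tolerate kubernetes client libraries that predate the `claims` field on `V1ResourceRequirements` (per the test's TODO, the field arrived in kubernetes client >= 26). As a standalone reference, the feature-detection pattern it used looks like this; the helper name is hypothetical, while the `hasattr` probe on the generated client class is the piece taken from the removed code:

```python
from kubernetes import client as k8s_client


def make_resource_requirements(requests, limits, claims=None):
  """Builds V1ResourceRequirements, passing `claims` only when supported.

  The `claims` attribute is only generated on newer kubernetes clients;
  probing the class with hasattr() avoids a TypeError on older ones.
  """
  if claims is not None and hasattr(k8s_client.V1ResourceRequirements, 'claims'):
    return k8s_client.V1ResourceRequirements(
        requests=requests, limits=limits, claims=claims)
  return k8s_client.V1ResourceRequirements(requests=requests, limits=limits)
```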
tfx/components/infra_validator/model_server_runners/kubernetes_runner_test.py

@@ -31,7 +31,7 @@
 from google.protobuf import json_format


-def _CreateServingSpec(payload: Dict[str, Any]):
+def _create_serving_spec(payload: Dict[str, Any]):
   result = infra_validator_pb2.ServingSpec()
   json_format.ParseDict(payload, result)
   return result
@@ -194,30 +194,35 @@ def testBuildPodManifest_InsideKfp_OverrideConfig(self):
         'service_account_name': 'chocolate-latte',
         'active_deadline_seconds': 123,
         'serving_pod_overrides': {
-            'annotations': {'best_ticker': 'goog'},
-            'env': [
-                {'name': 'TICKER', 'value': 'GOOG'},
-                {'name': 'NAME_ONLY'},
-                {
-                    'name': 'SECRET',
-                    'value_from': {
-                        'secret_key_ref': {'name': 'my_secret', 'key': 'my_key'}
-                    },
-                },
-            ],
-            'resources': {
-                # TODO(b/328171600): Uncomment when version of kubernetes
-                # is matched with TFX. Kubernetes >= 26 supports 'claims' field,
-                # while TFX is at version 12.
-                'claims': {},
-                'requests': {'memory': '2Gi', 'cpu': '1'},
-                'limits': {'memory': '4Gi', 'cpu': '2'},
-            },
+            'annotations': {
+                'best_ticker': 'goog'
+            },
+            'env': [{
+                'name': 'TICKER',
+                'value': 'GOOG'
+            }, {
+                'name': 'NAME_ONLY'
+            }, {
+                'name': 'SECRET',
+                'value_from': {
+                    'secret_key_ref': {
+                        'name': 'my_secret',
+                        'key': 'my_key'
+                    }
+                }
+            }],
+            'resources': {
+                'requests': {
+                    'memory': '2Gi',
+                    'cpu': '1'
+                },
+                'limits': {
+                    'memory': '4Gi',
+                    'cpu': '2'
+                },
+            }
         }
     }
-    if not hasattr(k8s_client.V1ResourceRequirements, 'claims'):
-      k8s_config_dict['serving_pod_overrides']['resources'].pop('claims')

     runner = self._CreateKubernetesRunner(k8s_config_dict=k8s_config_dict)

     # Act.
tfx/components/statistics_gen/executor.py (36 changes: 2 additions & 34 deletions)

@@ -72,10 +72,6 @@ def Do(
         not also contain a schema.
       - exclude_splits: JSON-serialized list of names of splits where
         statistics and sample should not be generated.
-      - sample_rate_by_split: Optionally, A dict mapping split_name to sample
-        rate, which is used to apply a different sample rate to the
-        corresponding split. When this is supplied, it will overwrite the
-        single sample rate on stats_options_json.

     Raises:
       ValueError when a schema is provided both as an input and as part of the
@@ -103,16 +99,6 @@ def Do(
           % type(exclude_splits)
       )

-    # Load sample_rate_by_split from execution properties.
-    sample_rate_by_split = (
-        json_utils.loads(
-            exec_properties.get(
-                standard_component_specs.SAMPLE_RATE_BY_SPLIT_KEY, 'null'
-            )
-        )
-        or {}
-    )
-
     # Setup output splits.
     examples = artifact_utils.get_single_instance(
         input_dict[standard_component_specs.EXAMPLES_KEY]
@@ -131,14 +117,6 @@ def Do(
     splits = artifact_utils.decode_split_names(examples.split_names)

     split_names = [split for split in splits if split not in exclude_splits]
-
-    # Check if sample_rate_by_split contains invalid split names
-    for split in sample_rate_by_split:
-      if split not in split_names:
-        logging.error(
-            'Split %s provided in sample_rate_by_split is not valid.', split
-        )
-
     statistics_artifact = artifact_utils.get_single_instance(
         output_dict[standard_component_specs.STATISTICS_KEY]
     )
@@ -155,14 +133,13 @@ def Do(
       )
     except Exception as e:  # pylint: disable=broad-except
       # log on failures to not bring down Statsgen jobs
-      logging.exception('Failed to generate stats dashboard link because %s', e)
+      logging.error('Failed to generate stats dashboard link because %s', e)
       statistics_artifact.set_string_custom_property(STATS_DASHBOARD_LINK, '')

     stats_options = options.StatsOptions()
     stats_options_json = exec_properties.get(
         standard_component_specs.STATS_OPTIONS_JSON_KEY
     )
-
     if stats_options_json:
       # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
       # json_utils
@@ -232,15 +209,6 @@ def Do(
       )
       binary_stats_output_path = os.path.join(output_uri, DEFAULT_FILE_NAME)

-      # Update sample rate for each split in stats_options if
-      # sample_rate_by_split is provided
-      split_stats_options = tfdv.StatsOptions.from_json(
-          stats_options.to_json())
-      if sample_rate_by_split:
-        sample_rate = sample_rate_by_split.get(split, None)
-        if sample_rate is not None:
-          split_stats_options.sample_rate = sample_rate
-
       data = p | 'TFXIORead[%s]' % split >> tfxio.BeamSource()
       if write_sharded_output:
         sharded_stats_output_prefix = os.path.join(
@@ -259,7 +227,7 @@ def Do(
       _ = (
           data
           | 'GenerateStatistics[%s]' % split
-          >> tfdv.GenerateStatistics(split_stats_options)
+          >> tfdv.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[%s]' % split >> write_transform
       )
      logging.info(
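Most of the lines removed from this executor implemented per-split sample-rate overrides. Reconstructed from the deleted code, the pattern amounted to the sketch below; the helper name is hypothetical, while `StatsOptions.to_json()`/`from_json()` and the `sample_rate` attribute are the same TFDV surface the removed lines used:

```python
from typing import Dict, Optional

import tensorflow_data_validation as tfdv


def _stats_options_for_split(
    split: str,
    stats_options: tfdv.StatsOptions,
    sample_rate_by_split: Dict[str, float],
) -> tfdv.StatsOptions:
  """Returns a copy of stats_options with the split's sample rate applied."""
  # Round-trip through JSON so each split mutates its own independent copy.
  split_options = tfdv.StatsOptions.from_json(stats_options.to_json())
  sample_rate: Optional[float] = sample_rate_by_split.get(split)
  if sample_rate is not None:
    split_options.sample_rate = sample_rate
  return split_options
```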
tfx/components/statistics_gen/executor_test.py (30 changes: 5 additions & 25 deletions)

@@ -34,34 +34,18 @@
         'testcase_name': 'no_sharded_output',
         'sharded_output': False,
         'custom_split_uri': False,
-        'sample_rate_by_split': 'null',
     },
     {
         'testcase_name': 'custom_split_uri',
         'sharded_output': False,
         'custom_split_uri': True,
-        'sample_rate_by_split': 'null',
     },
-    {
-        'testcase_name': 'sample_rate_by_split',
-        'sharded_output': False,
-        'custom_split_uri': False,
-        # set a higher sample rate since test data is small
-        'sample_rate_by_split': '{"train": 0.4, "eval": 0.6}',
-    },
-    {
-        'testcase_name': 'sample_rate_split_nonexist',
-        'sharded_output': False,
-        'custom_split_uri': False,
-        'sample_rate_by_split': '{"test": 0.05}',
-    },
 ]
 if tfdv.default_sharded_output_supported():
   _EXECUTOR_TEST_PARAMS.append({
       'testcase_name': 'yes_sharded_output',
       'sharded_output': True,
       'custom_split_uri': False,
-      'sample_rate_by_split': 'null',
   })
 _TEST_SPAN_NUMBER = 16000

@@ -91,12 +75,7 @@ def _validate_sharded_stats_output(self, stats_prefix):
     self._validate_stats(stats)

   @parameterized.named_parameters(*_EXECUTOR_TEST_PARAMS)
-  def testDo(
-      self,
-      sharded_output: bool,
-      custom_split_uri: bool,
-      sample_rate_by_split: str,
-  ):
+  def testDo(self, sharded_output: bool, custom_split_uri: bool):
     source_data_dir = os.path.join(
         os.path.dirname(os.path.dirname(__file__)), 'testdata')
     output_data_dir = os.path.join(
@@ -129,9 +108,10 @@ def testDo(

     exec_properties = {
         # List needs to be serialized before being passed into Do function.
-        standard_component_specs.EXCLUDE_SPLITS_KEY: json_utils.dumps(['test']),
-        standard_component_specs.SHARDED_STATS_OUTPUT_KEY: sharded_output,
-        standard_component_specs.SAMPLE_RATE_BY_SPLIT_KEY: sample_rate_by_split,
+        standard_component_specs.EXCLUDE_SPLITS_KEY:
+            json_utils.dumps(['test']),
+        standard_component_specs.SHARDED_STATS_OUTPUT_KEY:
+            sharded_output,
     }

     # Create output dict.
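The comment kept in the exec_properties block is worth unpacking: list-valued execution properties must cross the component boundary as JSON strings. A minimal round-trip, using the same `json_utils` helpers that appear in the diff; the literal key and the 'null' default are assumptions that mirror the executor's own loading code shown earlier:

```python
from tfx.utils import json_utils

# Test side: serialize the list before handing it to the executor's Do().
exec_properties = {'exclude_splits': json_utils.dumps(['test'])}

# Executor side: decode, defaulting to an empty list when the key is absent.
exclude_splits = json_utils.loads(
    exec_properties.get('exclude_splits', 'null')) or []
assert exclude_splits == ['test']
```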