diff --git a/.bazelrc b/.bazelrc
index 02dec0349c4741..c17ae4494dc99c 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -226,12 +226,13 @@ build:cuda --repo_env TF_NEED_CUDA=1
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --@local_config_cuda//:enable_cuda
 
+build:no_cuda_libs --@local_config_cuda//cuda:include_hermetic_cuda_libs=false
+
 # CUDA: This config refers to building CUDA op kernels with clang.
 build:cuda_clang --config=cuda
-# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt
-build:cuda_clang --config=tensorrt
 build:cuda_clang --action_env=TF_CUDA_CLANG="1"
 build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
+build:cuda_clang --copt=-Qunused-arguments
 # Select supported compute capabilities (supported graphics cards).
 # This is the same as the official TensorFlow builds.
 # See https://developer.nvidia.com/cuda-gpus#compute
@@ -244,12 +245,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8
 
 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
 build:cuda_clang_official --config=cuda_clang
-build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
-build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
-build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
+build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3"
+build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8.9"
 build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
 build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
-build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
 build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"
 
 # Build with nvcc for CUDA and clang for host
@@ -545,10 +544,6 @@ build:rbe_linux_cuda --config=cuda_clang_official
 build:rbe_linux_cuda --config=rbe_linux_cpu
 # For Remote build execution -- GPU configuration
 build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
-build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
-build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt"
-build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
-test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
 
 build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
 build:rbe_linux_cuda_nvcc --config=nvcc_clang
@@ -633,7 +628,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla
 # Test-related settings below this point.
 test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
 test:release_linux_base --local_test_jobs=HOST_CPUS
-test:release_linux_base --test_env=LD_LIBRARY_PATH
 # Give only the list of failed tests at the end of the log
 test:release_linux_base --test_summary=short
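The `no_cuda_libs` config added above is the single switch that keeps NVIDIA's shared libraries out of a build's outputs while leaving CUDA compilation itself enabled. A minimal sketch of how a user-side .bazelrc could compose it with the existing configs (the `gpu_wheel` config name is hypothetical):

    # Compile against hermetic CUDA but do not bundle the NVIDIA .so files,
    # mirroring what ci/official/wheel.sh does below.
    build:gpu_wheel --config=cuda_clang_official
    build:gpu_wheel --config=no_cuda_libs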
@@ -645,7 +639,6 @@ build:release_gpu_linux --config=release_cpu_linux
 
 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
 # Note that linux cpu and cuda builds share the same toolchain now.
 build:release_gpu_linux --config=cuda_clang_official
-test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
 # Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think
 test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute
@@ -674,11 +667,8 @@ test:unsupported_cpu_linux --config=release_base
 
 build:unsupported_gpu_linux --config=cuda
 build:unsupported_gpu_linux --config=unsupported_cpu_linux
 build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11"
-build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8"
+build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8.6"
 build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"
-build:unsupported_gpu_linux --config=tensorrt
-build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2"
-build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib"
 build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
 build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
diff --git a/ci/official/wheel.sh b/ci/official/wheel.sh
index e2a828bfacce35..5ae0feca48f646 100755
--- a/ci/official/wheel.sh
+++ b/ci/official/wheel.sh
@@ -27,7 +27,8 @@ if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then
   export TFCI_BUILD_PIP_PACKAGE_ARGS="$(echo $TFCI_BUILD_PIP_PACKAGE_ARGS | sed 's/tensorflow/tf_nightly/')"
 fi
 
-tfrun bazel build $TFCI_BAZEL_COMMON_ARGS //tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_ARGS
+tfrun bazel build $TFCI_BAZEL_COMMON_ARGS --config=no_cuda_libs \
+//tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_ARGS
 
 tfrun find ./bazel-bin/tensorflow/tools/pip_package -iname "*.whl" -exec cp {} $TFCI_OUTPUT_DIR \;
 tfrun ./ci/official/utilities/rename_and_verify_wheels.sh
diff --git a/configure.py b/configure.py
index 0081eeabf66bcc..01e7f343872797 100644
--- a/configure.py
+++ b/configure.py
@@ -16,7 +16,6 @@
 
 import argparse
 import errno
-import glob
 import json
 import os
 import platform
@@ -239,7 +238,7 @@ def setup_python(environ_cp):
   write_to_bazelrc('build --python_path=\"{}"'.format(python_bin_path))
   environ_cp['PYTHON_BIN_PATH'] = python_bin_path
 
-  # If choosen python_lib_path is from a path specified in the PYTHONPATH
+  # If chosen python_lib_path is from a path specified in the PYTHONPATH
   # variable, need to tell bazel to include PYTHONPATH
   if environ_cp.get('PYTHONPATH'):
     python_paths = environ_cp.get('PYTHONPATH').split(':')
@@ -775,11 +774,6 @@ def get_ndk_api_level(environ_cp, android_ndk_home_path):
 def set_gcc_host_compiler_path(environ_cp):
   """Set GCC_HOST_COMPILER_PATH."""
   default_gcc_host_compiler_path = which('gcc') or ''
-  cuda_bin_symlink = '%s/bin/gcc' % environ_cp.get('CUDA_TOOLKIT_PATH')
-
-  if os.path.islink(cuda_bin_symlink):
-    # os.readlink is only available in linux
-    default_gcc_host_compiler_path = os.path.realpath(cuda_bin_symlink)
 
   gcc_host_compiler_path = prompt_loop_or_load_from_env(
       environ_cp,
@@ -937,17 +931,6 @@ def disable_clang_offsetof_extension(clang_version):
     write_to_bazelrc('build --copt=-Wno-gnu-offsetof-extensions')
 
 
-def set_tf_cuda_paths(environ_cp):
-  """Set TF_CUDA_PATHS."""
-  ask_cuda_paths = (
-      'Please specify the comma-separated list of base paths to look for CUDA '
-      'libraries and headers. [Leave empty to use the default]: ')
-  tf_cuda_paths = get_from_env_or_user_or_default(environ_cp, 'TF_CUDA_PATHS',
-                                                  ask_cuda_paths, '')
-  if tf_cuda_paths:
-    environ_cp['TF_CUDA_PATHS'] = tf_cuda_paths
-
-
 def set_tf_cuda_version(environ_cp):
   """Set TF_CUDA_VERSION."""
   ask_cuda_version = (
@@ -972,73 +955,10 @@ def set_tf_cudnn_version(environ_cp):
     environ_cp['TF_CUDNN_VERSION'] = tf_cudnn_version
 
 
-def set_tf_tensorrt_version(environ_cp):
-  """Set TF_TENSORRT_VERSION."""
-  if not (is_linux() or is_windows()):
-    raise ValueError('Currently TensorRT is only supported on Linux platform.')
-
-  if not int(environ_cp.get('TF_NEED_TENSORRT', False)):
-    return
-
-  ask_tensorrt_version = (
-      'Please specify the TensorRT version you want to use. '
-      '[Leave empty to default to TensorRT %s]: ') % _DEFAULT_TENSORRT_VERSION
-  tf_tensorrt_version = get_from_env_or_user_or_default(
-      environ_cp, 'TF_TENSORRT_VERSION', ask_tensorrt_version,
-      _DEFAULT_TENSORRT_VERSION)
-  environ_cp['TF_TENSORRT_VERSION'] = tf_tensorrt_version
-
-
-def set_tf_nccl_version(environ_cp):
-  """Set TF_NCCL_VERSION."""
-  if not is_linux():
-    raise ValueError('Currently NCCL is only supported on Linux platform.')
-
-  if 'TF_NCCL_VERSION' in environ_cp:
-    return
-
-  ask_nccl_version = (
-      'Please specify the locally installed NCCL version you want to use. '
-      '[Leave empty to use http://github.com/nvidia/nccl]: ')
-  tf_nccl_version = get_from_env_or_user_or_default(environ_cp,
-                                                    'TF_NCCL_VERSION',
-                                                    ask_nccl_version, '')
-  environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
-
-
-def get_native_cuda_compute_capabilities(environ_cp):
-  """Get native cuda compute capabilities.
-
-  Args:
-    environ_cp: copy of the os.environ.
-
-  Returns:
-    string of native cuda compute capabilities, separated by comma.
- """ - device_query_bin = os.path.join( - environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery') - if os.path.isfile(device_query_bin) and os.access(device_query_bin, os.X_OK): - try: - output = run_shell(device_query_bin).split('\n') - pattern = re.compile('[0-9]*\\.[0-9]*') - output = [pattern.search(x) for x in output if 'Capability' in x] - output = ','.join(x.group() for x in output if x is not None) - except subprocess.CalledProcessError: - output = '' - else: - output = '' - return output - - def set_tf_cuda_compute_capabilities(environ_cp): """Set TF_CUDA_COMPUTE_CAPABILITIES.""" while True: - native_cuda_compute_capabilities = get_native_cuda_compute_capabilities( - environ_cp) - if not native_cuda_compute_capabilities: - default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES - else: - default_cuda_compute_capabilities = native_cuda_compute_capabilities + default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES ask_cuda_compute_capabilities = ( 'Please specify a list of comma-separated CUDA compute capabilities ' @@ -1217,73 +1137,6 @@ def configure_ios(environ_cp): symlink_force(filepath, new_filepath) -def validate_cuda_config(environ_cp): - """Run find_cuda_config.py and return cuda_toolkit_path, or None.""" - - def maybe_encode_env(env): - """Encodes unicode in env to str on Windows python 2.x.""" - if not is_windows() or sys.version_info[0] != 2: - return env - for k, v in env.items(): - if isinstance(k, unicode): - k = k.encode('ascii') - if isinstance(v, unicode): - v = v.encode('ascii') - env[k] = v - return env - - cuda_libraries = ['cuda', 'cudnn'] - if is_linux(): - if int(environ_cp.get('TF_NEED_TENSORRT', False)): - cuda_libraries.append('tensorrt') - if environ_cp.get('TF_NCCL_VERSION', None): - cuda_libraries.append('nccl') - if is_windows(): - if int(environ_cp.get('TF_NEED_TENSORRT', False)): - cuda_libraries.append('tensorrt') - print('WARNING: TensorRT support on Windows is experimental\n') - - paths = glob.glob('**/third_party/gpus/find_cuda_config.py', recursive=True) - if not paths: - raise FileNotFoundError( - "Can't find 'find_cuda_config.py' script inside working directory") - proc = subprocess.Popen( - [environ_cp['PYTHON_BIN_PATH'], paths[0]] + cuda_libraries, - stdout=subprocess.PIPE, - env=maybe_encode_env(environ_cp)) - - if proc.wait(): - # Errors from find_cuda_config.py were sent to stderr. 
- print('Asking for detailed CUDA configuration...\n') - return False - - config = dict( - tuple(line.decode('ascii').rstrip().split(': ')) for line in proc.stdout) - - print('Found CUDA %s in:' % config['cuda_version']) - print(' %s' % config['cuda_library_dir']) - print(' %s' % config['cuda_include_dir']) - - print('Found cuDNN %s in:' % config['cudnn_version']) - print(' %s' % config['cudnn_library_dir']) - print(' %s' % config['cudnn_include_dir']) - - if 'tensorrt_version' in config: - print('Found TensorRT %s in:' % config['tensorrt_version']) - print(' %s' % config['tensorrt_library_dir']) - print(' %s' % config['tensorrt_include_dir']) - - if config.get('nccl_version', None): - print('Found NCCL %s in:' % config['nccl_version']) - print(' %s' % config['nccl_library_dir']) - print(' %s' % config['nccl_include_dir']) - - print('\n') - - environ_cp['CUDA_TOOLKIT_PATH'] = config['cuda_toolkit_path'] - return True - - def get_gcc_compiler(environ_cp): gcc_env = environ_cp.get('CXX') or environ_cp.get('CC') or which('gcc') if gcc_env is not None: @@ -1388,57 +1241,20 @@ def main(): if (environ_cp.get('TF_NEED_CUDA') == '1' and 'TF_CUDA_CONFIG_REPO' not in environ_cp): - set_action_env_var( - environ_cp, - 'TF_NEED_TENSORRT', - 'TensorRT', - False, - bazel_config_name='tensorrt') - - environ_save = dict(environ_cp) for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): - if validate_cuda_config(environ_cp): - cuda_env_names = [ - 'TF_CUDA_VERSION', - 'TF_CUBLAS_VERSION', - 'TF_CUDNN_VERSION', - 'TF_TENSORRT_VERSION', - 'TF_NCCL_VERSION', - 'TF_CUDA_PATHS', - # Items below are for backwards compatibility when not using - # TF_CUDA_PATHS. - 'CUDA_TOOLKIT_PATH', - 'CUDNN_INSTALL_PATH', - 'NCCL_INSTALL_PATH', - 'NCCL_HDR_PATH', - 'TENSORRT_INSTALL_PATH' - ] - # Note: set_action_env_var above already writes to bazelrc. - for name in cuda_env_names: - if name in environ_cp: - write_action_env_to_bazelrc(name, environ_cp[name]) - break - - # Restore settings changed below if CUDA config could not be validated. - environ_cp = dict(environ_save) - - set_tf_cuda_version(environ_cp) - set_tf_cudnn_version(environ_cp) - if is_windows(): - set_tf_tensorrt_version(environ_cp) - if is_linux(): - set_tf_tensorrt_version(environ_cp) - set_tf_nccl_version(environ_cp) - - set_tf_cuda_paths(environ_cp) + cuda_env_names = [ + 'TF_CUDA_VERSION', + 'TF_CUDNN_VERSION', + ] + # Note: set_action_env_var above already writes to bazelrc. + for name in cuda_env_names: + if name in environ_cp: + write_action_env_to_bazelrc(name, environ_cp[name]) + break - else: - raise UserInputError( - 'Invalid CUDA setting were provided %d ' - 'times in a row. Assuming to be a scripting mistake.' 
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir
index 2d3c8e6f5b9ef7..67f5a036fccc32 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir
@@ -1,3 +1,4 @@
+// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../../../../cuda_nvcc"
 // RUN: hlo_to_kernel --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70
 
 func.func @tanh(%arg0: tensor<*xf32>) -> tensor<*xf32> attributes {tf_entry} {
diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index 84ec94ba673ff8..6de9910ca2233f 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -158,6 +158,7 @@ tf_cuda_library(
         "@local_config_cuda//cuda:cudnn_header",
         "@local_xla//xla/stream_executor/cuda:cuda_platform",
         "@local_xla//xla/stream_executor/gpu:gpu_stream",
+        "@local_xla//xla/tsl:gpu_runtime_hermetic_cuda_deps",
     ],
     defines = if_linux_x86_64(["TF_PLATFORM_LINUX_X86_64"]),
     features = ["-layering_check"],
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index b2645a331739e3..7c3947fa49e861 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -207,6 +207,8 @@ tf_staging/third_party/compute_library/BUILD:
 tf_staging/third_party/compute_library/build_defs.bzl:
 tf_staging/third_party/coremltools.BUILD:
 tf_staging/third_party/cub.BUILD:
+tf_staging/third_party/cuda_redist_json_repo.bzl:
+tf_staging/third_party/cuda_repo.bzl:
 tf_staging/third_party/curl.BUILD:
 tf_staging/third_party/cython.BUILD:
 tf_staging/third_party/ducc/BUILD:
@@ -233,6 +235,7 @@ tf_staging/third_party/googleapis/build_rules.bzl:
 tf_staging/third_party/googleapis/googleapis.BUILD:
 tf_staging/third_party/googleapis/repository_rules.bzl:
 tf_staging/third_party/gpus/BUILD:
+tf_staging/third_party/gpus/compiler_common_tools.bzl:
 tf_staging/third_party/gpus/crosstool/BUILD.rocm.tpl:
 tf_staging/third_party/gpus/crosstool/BUILD.sycl.tpl:
 tf_staging/third_party/gpus/crosstool/BUILD.tpl:
@@ -243,15 +246,32 @@ tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tp
 tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_sycl.tpl:
 tf_staging/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl:
 tf_staging/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl:
+tf_staging/third_party/gpus/cuda/BUILD.hermetic.tpl:
 tf_staging/third_party/gpus/cuda/BUILD.tpl:
 tf_staging/third_party/gpus/cuda/BUILD.windows.tpl:
 tf_staging/third_party/gpus/cuda/BUILD:
 tf_staging/third_party/gpus/cuda/LICENSE:
 tf_staging/third_party/gpus/cuda/build_defs.bzl.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cccl.BUILD:
 tf_staging/third_party/gpus/cuda/cuda_config.h.tpl:
 tf_staging/third_party/gpus/cuda/cuda_config.py.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cublas.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cudart.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cufft.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cupti.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_curand.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_nccl.BUILD:
+tf_staging/third_party/gpus/cuda/cuda_nvcc.BUILD:
+tf_staging/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl:
+tf_staging/third_party/gpus/cuda/cuda_nvml.BUILD:
+tf_staging/third_party/gpus/cuda/cuda_nvprune.BUILD:
+tf_staging/third_party/gpus/cuda/cuda_nvtx.BUILD:
 tf_staging/third_party/gpus/cuda_configure.bzl:
 tf_staging/third_party/gpus/find_cuda_config:.py
+tf_staging/third_party/gpus/hermetic_cuda_configure.bzl:
 tf_staging/third_party/gpus/rocm/BUILD.tpl:
 tf_staging/third_party/gpus/rocm/BUILD:
 tf_staging/third_party/gpus/rocm/build_defs.bzl.tpl:
@@ -283,6 +303,7 @@ tf_staging/third_party/nccl/archive.BUILD:
 tf_staging/third_party/nccl/archive.patch:
 tf_staging/third_party/nccl/build_defs.bzl.tpl:
 tf_staging/third_party/nccl/generated_names.bzl.tpl:
+tf_staging/third_party/nccl/hermetic_nccl_configure.bzl:
 tf_staging/third_party/nccl/nccl_configure.bzl:
 tf_staging/third_party/nccl/system.BUILD.tpl:
 tf_staging/third_party/nlohmann_json.BUILD:
diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD
index bac9403d63dc27..eff65990c58c38 100644
--- a/tensorflow/python/framework/BUILD
+++ b/tensorflow/python/framework/BUILD
@@ -6,6 +6,7 @@ load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
 # Placeholder: load py_proto_library
 load(
     "//tensorflow:tensorflow.bzl",
+    "if_hermetic_cuda_tools",
     "if_not_windows",
     "if_oss",
     "if_xla_available",
@@ -1045,6 +1046,13 @@ tf_python_pybind_extension(
         "python_api_dispatcher.h",
         "//tensorflow/python/lib/core:safe_pyobject_ptr_required_hdrs",
     ],
+    # This data is needed to add hermetic CUDA tools in python runfiles.
+    data = if_hermetic_cuda_tools(
+        [
+            "@cuda_nvcc//:ptxas",
+            "@cuda_nvcc//:nvvm",
+        ],
+    ),
     enable_stub_generation = True,
     pytype_srcs = [
         "_pywrap_python_api_dispatcher.pyi",
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 6e656b861bedaf..0bf6a91f995007 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -73,6 +73,7 @@ load(
     "tsl_gpu_library",
     _cc_header_only_library = "cc_header_only_library",
     _if_cuda_or_rocm = "if_cuda_or_rocm",
+    _if_hermetic_cuda_tools = "if_hermetic_cuda_tools",
     _if_nccl = "if_nccl",
     _transitive_hdrs = "transitive_hdrs",
 )
@@ -803,7 +804,7 @@ def tf_cc_shared_object(
     testonly = kwargs.pop("testonly", False)
 
     for name_os, name_os_major, name_os_full in names:
-        # Windows DLLs cant be versioned
+        # Windows DLLs can't be versioned
        if name_os.endswith(".dll"):
             name_os_major = name_os
             name_os_full = name_os
@@ -3578,3 +3579,6 @@ def replace_with_portable_tf_lib_when_required(non_portable_tf_deps, use_lib_wit
 
 def tf_python_framework_friends():
     return ["//tensorflow:__subpackages__"]
+
+def if_hermetic_cuda_tools(if_true, if_false = []):
+    return _if_hermetic_cuda_tools(if_true, if_false)
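The new `if_hermetic_cuda_tools` macro gives BUILD files the same select-style ergonomics as `if_cuda`, as the python/framework/BUILD change above shows. A sketch of using it from another BUILD file (target and file names hypothetical):

    load("//tensorflow:tensorflow.bzl", "if_hermetic_cuda_tools")

    py_binary(
        name = "my_gpu_tool",
        srcs = ["my_gpu_tool.py"],
        # Ship ptxas and libdevice in runfiles only when hermetic tools are on.
        data = if_hermetic_cuda_tools(
            ["@cuda_nvcc//:ptxas", "@cuda_nvcc//:nvvm"],
            [],
        ),
    )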
""" path_to_exclude = [ + "cuda_cccl/_virtual_includes", + "cuda_cublas/_virtual_includes", + "cuda_cudart/_virtual_includes", + "cuda_cudnn/_virtual_includes", + "cuda_cufft/_virtual_includes", + "cuda_cupti/_virtual_includes", + "cuda_curand/_virtual_includes", + "cuda_cusolver/_virtual_includes", + "cuda_cusparse/_virtual_includes", + "cuda_nccl/_virtual_includes", + "cuda_nvcc/_virtual_includes", + "cuda_nvjitlink/_virtual_includes", + "cuda_nvml/_virtual_includes", + "cuda_nvtx/_virtual_includes", + "external/cuda_cccl", + "external/cuda_cublas", + "external/cuda_cudart", + "external/cuda_cudnn", + "external/cuda_cufft", + "external/cuda_cupti", + "external/cuda_curand", + "external/cuda_cusolver", + "external/cuda_cusparse", + "external/cuda_nccl", + "external/cuda_nvcc", + "external/cuda_nvjitlink", + "external/cuda_nvml", + "external/cuda_nvtx", "external/pypi", "external/jsoncpp_git/src", "local_config_cuda/cuda/_virtual_includes", diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl index 41fe389aa09cad..48473d49e84ce2 100644 --- a/tensorflow/tools/toolchains/remote_config/configs.bzl +++ b/tensorflow/tools/toolchains/remote_config/configs.bzl @@ -703,7 +703,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -742,7 +742,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -782,7 +782,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) @@ -820,7 +820,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) diff --git a/tensorflow/tools/toolchains/remote_config/rbe_config.bzl b/tensorflow/tools/toolchains/remote_config/rbe_config.bzl index ae776c2a2fd388..317e7139e5f96b 100644 --- a/tensorflow/tools/toolchains/remote_config/rbe_config.bzl +++ b/tensorflow/tools/toolchains/remote_config/rbe_config.bzl @@ -1,9 +1,15 @@ """Macro that creates external repositories for remote config.""" load("//tensorflow/tools/toolchains/remote_config:containers.bzl", "containers") -load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") -load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure") @@ -42,7 +48,7 @@ def 
_tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "TF_CUDNN_VERSION": cudnn_version, "TF_CUDA_VERSION": cuda_version, "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "", "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu", "GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "", @@ -58,13 +64,17 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "Pool": "default", } - remote_cuda_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_cuda_configure. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_nccl_configure. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, @@ -175,13 +185,15 @@ def sigbuild_tf_configs(name_container_map, env): "Pool": "default", } - remote_cuda_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_cuda_configure for non-hermetic CUDA. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_nccl_configure for non-hermetic NCCL. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl index 77eea2ac869167..c041ad5d1abbbc 100644 --- a/tensorflow/workspace2.bzl +++ b/tensorflow/workspace2.bzl @@ -15,6 +15,7 @@ load("//tensorflow/tools/toolchains/clang6:repo.bzl", "clang6_configure") load("//tensorflow/tools/toolchains/embedded/arm-linux:arm_linux_toolchain_configure.bzl", "arm_linux_toolchain_configure") load("//tensorflow/tools/toolchains/remote:configure.bzl", "remote_execution_configure") load("//tensorflow/tools/toolchains/remote_config:configs.bzl", "initialize_rbe_configs") +load("//third_party:cuda_repo.bzl", "cuda_distributives") load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") load("//third_party/absl:workspace.bzl", absl = "repo") load("//third_party/benchmark:workspace.bzl", benchmark = "repo") @@ -29,7 +30,10 @@ load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo") load("//third_party/FP16:workspace.bzl", FP16 = "repo") load("//third_party/gemmlowp:workspace.bzl", gemmlowp = "repo") load("//third_party/git:git_configure.bzl", "git_configure") -load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "rocm_configure") load("//third_party/hexagon:workspace.bzl", hexagon_nn = "repo") load("//third_party/highwayhash:workspace.bzl", highwayhash = "repo") @@ -41,7 +45,10 @@ load("//third_party/kissfft:workspace.bzl", kissfft = "repo") load("//third_party/libprotobuf_mutator:workspace.bzl", libprotobuf_mutator = "repo") load("//third_party/llvm:setup.bzl", "llvm_setup") load("//third_party/nasm:workspace.bzl", nasm = "repo") 
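As the comments in rbe_config.bzl spell out, the non-hermetic path stays one edit away. For reference, a sketch of the rollback inside `_tensorflow_rbe_config` (same repository names, loads swapped back):

    load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure")
    load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")

    remote_cuda_configure(
        name = "%s_config_cuda" % name,
        environ = env,
        exec_properties = exec_properties,
    )
    remote_nccl_configure(
        name = "%s_config_nccl" % name,
        environ = env,
        exec_properties = exec_properties,
    )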
-load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/opencl_headers:workspace.bzl", opencl_headers = "repo") load("//third_party/pasta:workspace.bzl", pasta = "repo") load("//third_party/py:python_configure.bzl", "python_configure") @@ -103,9 +110,15 @@ def _tf_toolchains(): # Note that we check the minimum bazel version in WORKSPACE. clang6_configure(name = "local_config_clang6") cc_download_clang_toolchain(name = "local_config_download_clang") - cuda_configure(name = "local_config_cuda") + + # If you need to use non-hermetic CUDA, replace the line below with + # cuda_configure(name = "local_config_cuda") + hermetic_cuda_configure(name = "local_config_cuda") tensorrt_configure(name = "local_config_tensorrt") - nccl_configure(name = "local_config_nccl") + + # If you need to use non-hermetic CUDA, replace the line below with + # nccl_configure(name = "local_config_nccl") + hermetic_nccl_configure(name = "local_config_nccl") git_configure(name = "local_config_git") syslibs_configure(name = "local_config_syslibs") python_configure(name = "local_config_python") @@ -919,6 +932,28 @@ def _tf_repositories(): version_conflict_policy = "pinned", ) +_CUDA_12_3_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + "sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + +_CUDA_12_1_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + "sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + def workspace(): # Check the bazel version before executing any repository rules, in case # those rules rely on the version we require here. @@ -936,6 +971,10 @@ def workspace(): # don't already exist (at least if the external repository macros were # written according to common practice to query native.existing_rule()). _tf_repositories() + cuda_distributives(cuda_nccl_wheel_dict = { + "12.3.2": _CUDA_12_3_NCCL_WHEEL_DICT, + "12.1.1": _CUDA_12_1_NCCL_WHEEL_DICT, + }) tfrt_dependencies() diff --git a/tensorflow/workspace3.bzl b/tensorflow/workspace3.bzl index d7b32f01c7144d..3632d49ade844f 100644 --- a/tensorflow/workspace3.bzl +++ b/tensorflow/workspace3.bzl @@ -1,10 +1,33 @@ """TensorFlow workspace initialization. 
Consult the WORKSPACE on how to use it.""" load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:cuda_redist_json_repo.bzl", "cuda_redist_json") load("//third_party:repo.bzl", "tf_vendored") load("//third_party/llvm:workspace.bzl", llvm = "repo") load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo") +_CUDA_REDIST_JSON_DICT = { + "12.1.1": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.1.1.json", + "bafea3cb83a4cf5c764eeedcaac0040d0d3c5db3f9a74550da0e7b6ac24d378c", + ], + "12.3.2": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.3.2.json", + "1b6eacf335dd49803633fed53ef261d62c193e5a56eee5019e7d2f634e39e7ef", + ], +} + +_CUDNN_REDIST_JSON_DICT = { + "8.6": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.6.0.json", + "7f6f50bed4fd8216dc10d6ef505771dc0ecc99cce813993ab405cb507a21d51d", + ], + "8.9.7.29": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.9.7.29.json", + "a0734f26f068522464fa09b2f2c186dfbe6ad7407a88ea0c50dd331f0c3389ec", + ], +} + def workspace(): tf_vendored(name = "local_xla", relpath = "third_party/xla") tf_vendored(name = "local_tsl", relpath = "third_party/xla/third_party/tsl") @@ -62,6 +85,13 @@ def workspace(): # but provides a script for setting up build rules via overlays. llvm("llvm-raw") + # Load JSON files for CUDA and cuDNN distribution versions. + cuda_redist_json( + name = "cuda_redist_json", + cuda_json_dict = _CUDA_REDIST_JSON_DICT, + cudnn_json_dict = _CUDNN_REDIST_JSON_DICT, + ) + # Alias so it can be loaded without assigning to a different symbol to prevent # shadowing previous loads and trigger a buildifier warning. tf_workspace3 = workspace diff --git a/third_party/cuda_redist_json_repo.bzl b/third_party/cuda_redist_json_repo.bzl new file mode 100644 index 00000000000000..76941dd74c9488 --- /dev/null +++ b/third_party/cuda_redist_json_repo.bzl @@ -0,0 +1,110 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining CUDA and cuDNN JSON files with distributives versions.""" + +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_DEFAULT_CUDNN_VERSION = "8.9.7.29" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _cuda_redist_json_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + supported_cuda_versions = repository_ctx.attr.cuda_json_dict.keys() + if cuda_version and (cuda_version not in supported_cuda_versions): + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + else: + fail( + ("The supported CUDA versions are {supported_versions}." 
+ + " Please provide a supported version in TF_CUDA_VERSION" + + " environment variable or add JSON URL for" + + " CUDA version={version}.") + .format( + supported_versions = supported_cuda_versions, + version = cuda_version, + ), + ) + supported_cudnn_versions = repository_ctx.attr.cudnn_json_dict.keys() + if cudnn_version and (cudnn_version not in supported_cudnn_versions): + if cudnn_version in ["8", "8.9"]: + cudnn_version = _DEFAULT_CUDNN_VERSION + else: + fail( + ("The supported CUDNN versions are {supported_versions}." + + " Please provide a supported version in TF_CUDNN_VERSION" + + " environment variable or add JSON URL for" + + " CUDNN version={version}.") + .format( + supported_versions = supported_cudnn_versions, + version = cudnn_version, + ), + ) + cuda_distributives = "{}" + cudnn_distributives = "{}" + if cuda_version: + (url, sha256) = repository_ctx.attr.cuda_json_dict[cuda_version] + json_file_name = "redistrib_cuda_%s.json" % cuda_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cuda_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + if cudnn_version: + (url, sha256) = repository_ctx.attr.cudnn_json_dict[cudnn_version] + json_file_name = "redistrib_cudnn_%s.json" % cudnn_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cudnn_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + + repository_ctx.file( + "build_defs.bzl", + """def get_cuda_distributives(): + return {cuda_distributives} + +def get_cudnn_distributives(): + return {cudnn_distributives} +""".format(cuda_distributives = cuda_distributives, cudnn_distributives = cudnn_distributives), + ) + repository_ctx.file( + "BUILD", + "", + ) + +_cuda_redist_json = repository_rule( + implementation = _cuda_redist_json_impl, + attrs = { + "cuda_json_dict": attr.string_list_dict(mandatory = True), + "cudnn_json_dict": attr.string_list_dict(mandatory = True), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_redist_json(name, cuda_json_dict, cudnn_json_dict): + _cuda_redist_json( + name = name, + cuda_json_dict = cuda_json_dict, + cudnn_json_dict = cudnn_json_dict, + ) diff --git a/third_party/cuda_repo.bzl b/third_party/cuda_repo.bzl new file mode 100644 index 00000000000000..0dcda26cba3a23 --- /dev/null +++ b/third_party/cuda_repo.bzl @@ -0,0 +1,327 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for defining Google ML CUDA dependencies.""" + +load("@cuda_redist_json//:build_defs.bzl", "get_cuda_distributives", "get_cudnn_distributives") +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_OS_ARCH_DICT = { + "amd64": "x86_64-unknown-linux-gnu", + "aarch64": "aarch64-unknown-linux-gnu", +} +_REDIST_ARCH_DICT = { + "linux-x86_64": "x86_64-unknown-linux-gnu", + "linux-sbsa": "aarch64-unknown-linux-gnu", +} + +_CUDA_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cuda/redist/" +_CUDNN_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cudnn/redist/" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _get_archive_name(url, archive_suffix = ".tar.xz"): + last_slash_index = url.rfind("/") + return url[last_slash_index + 1:-len(archive_suffix)] + +def _cuda_http_archive_impl(repository_ctx): + cuda_or_cudnn_version = None + dist_version = "" + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + if repository_ctx.attr.is_cudnn_dist: + cuda_or_cudnn_version = cudnn_version + else: + cuda_or_cudnn_version = cuda_version + if cuda_or_cudnn_version: + # Download archive only when GPU config is used. + dist_version = repository_ctx.attr.dist_version + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + if arch not in repository_ctx.attr.relative_url_dict.keys(): + (relative_url, sha256) = repository_ctx.attr.relative_url_dict["cuda{version}_{arch}" \ + .format( + version = cuda_version.split(".")[0], + arch = arch, + )] + else: + (relative_url, sha256) = repository_ctx.attr.relative_url_dict[arch] + url = (repository_ctx.attr.cudnn_dist_path_prefix if repository_ctx.attr.is_cudnn_dist else repository_ctx.attr.cuda_dist_path_prefix) + relative_url + + archive_name = _get_archive_name(url) + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.override_strip_prefix if repository_ctx.attr.override_strip_prefix else archive_name, + ) + if repository_ctx.attr.build_template: + version = dist_version.split(".")[0] if dist_version else "" + repository_ctx.file("version.txt", version) + repository_ctx.template( + "BUILD", + repository_ctx.attr.build_template, + {"%{version}": version}, + ) + else: + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_http_archive = repository_rule( + implementation = _cuda_http_archive_impl, + attrs = { + "dist_version": attr.string(mandatory = True), + "relative_url_dict": attr.string_list_dict(mandatory = True), + "build_template": attr.label(), + "build_file": attr.label(), + "is_cudnn_dist": attr.bool(), + "override_strip_prefix": attr.string(), + "cudnn_dist_path_prefix": attr.string(default = _CUDNN_DIST_PATH_PREFIX), + "cuda_dist_path_prefix": attr.string(default = _CUDA_DIST_PATH_PREFIX), + "extension": attr.string(default = ".tar.xz"), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_http_archive(name, dist_version, relative_url_dict, **kwargs): + _cuda_http_archive( + name = name, + dist_version = dist_version, + relative_url_dict = relative_url_dict, + **kwargs + ) + +def _cuda_wheel_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, 
"TF_CUDA_VERSION") + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + if cuda_version: + # Download archive only when GPU config is used. + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + dict_key = "{cuda_version}-{arch}".format( + cuda_version = cuda_version, + arch = arch, + ) + supported_versions = repository_ctx.attr.url_dict.keys() + if dict_key not in supported_versions: + fail( + ("The supported NCCL versions are {supported_versions}." + + " Please provide a supported CUDA version in TF_CUDA_VERSION" + + " environment variable or add NCCL distributive for" + + " CUDA version={version}, OS={arch}.") + .format( + supported_versions = supported_versions, + version = cuda_version, + arch = arch, + ), + ) + sha256 = repository_ctx.attr.sha256_dict[dict_key] + url = repository_ctx.attr.url_dict[dict_key] + + archive_name = _get_archive_name(url, archive_suffix = ".whl") + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.strip_prefix, + ) + + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_wheel = repository_rule( + implementation = _cuda_wheel_impl, + attrs = { + "sha256_dict": attr.string_dict(mandatory = True), + "url_dict": attr.string_dict(mandatory = True), + "build_file": attr.label(), + "strip_prefix": attr.string(), + "extension": attr.string(default = ".zip"), + }, + environ = ["TF_CUDA_VERSION"], +) + +def cuda_wheel(name, sha256_dict, url_dict, **kwargs): + _cuda_wheel( + name = name, + sha256_dict = sha256_dict, + url_dict = url_dict, + **kwargs + ) + +def _get_relative_url_dict(dist_info): + relative_url_dict = {} + for arch in _REDIST_ARCH_DICT.keys(): + # CUDNN JSON might contain paths for each CUDA version. 
+ if "relative_path" not in dist_info[arch]: + for cuda_version, data in dist_info[arch].items(): + relative_url_dict["{cuda_version}_{arch}" \ + .format( + cuda_version = cuda_version, + arch = _REDIST_ARCH_DICT[arch], + )] = [data["relative_path"], data["sha256"]] + else: + relative_url_dict[_REDIST_ARCH_DICT[arch]] = [ + dist_info[arch]["relative_path"], + dist_info[arch]["sha256"], + ] + return relative_url_dict + +def _get_cuda_archive( + repo_name, + dist_dict, + dist_name, + build_file = None, + build_template = None, + is_cudnn_dist = False): + if dist_name in dist_dict.keys(): + return cuda_http_archive( + name = repo_name, + dist_version = dist_dict[dist_name]["version"], + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = _get_relative_url_dict(dist_dict[dist_name]), + is_cudnn_dist = is_cudnn_dist, + ) + else: + return cuda_http_archive( + name = repo_name, + dist_version = "", + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = {"": []}, + is_cudnn_dist = is_cudnn_dist, + ) + +def cuda_distributives(cuda_nccl_wheel_dict): + nccl_artifacts_dict = {"sha256_dict": {}, "url_dict": {}} + for cuda_version, nccl_wheel_info in cuda_nccl_wheel_dict.items(): + for arch in _OS_ARCH_DICT.values(): + if arch in nccl_wheel_info.keys(): + cuda_version_to_arch_key = "%s-%s" % (cuda_version, arch) + nccl_artifacts_dict["sha256_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["sha256"] + nccl_artifacts_dict["url_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["url"] + + cuda_wheel( + name = "cuda_nccl", + sha256_dict = nccl_artifacts_dict["sha256_dict"], + url_dict = nccl_artifacts_dict["url_dict"], + build_file = Label("//third_party/gpus/cuda:cuda_nccl.BUILD"), + strip_prefix = "nvidia/nccl", + ) + + cuda_distributives = get_cuda_distributives() + cudnn_distributives = get_cudnn_distributives() + + _get_cuda_archive( + repo_name = "cuda_cccl", + dist_dict = cuda_distributives, + dist_name = "cuda_cccl", + build_file = "//third_party/gpus/cuda:cuda_cccl.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_cublas", + dist_dict = cuda_distributives, + dist_name = "libcublas", + build_template = "//third_party/gpus/cuda:cuda_cublas.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudart", + dist_dict = cuda_distributives, + dist_name = "cuda_cudart", + build_template = "//third_party/gpus/cuda:cuda_cudart.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudnn", + dist_dict = cudnn_distributives, + dist_name = "cudnn", + build_template = "//third_party/gpus/cuda:cuda_cudnn.BUILD.tpl", + is_cudnn_dist = True, + ) + _get_cuda_archive( + repo_name = "cuda_cufft", + dist_dict = cuda_distributives, + dist_name = "libcufft", + build_template = "//third_party/gpus/cuda:cuda_cufft.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cupti", + dist_dict = cuda_distributives, + dist_name = "cuda_cupti", + build_template = "//third_party/gpus/cuda:cuda_cupti.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_curand", + dist_dict = cuda_distributives, + dist_name = "libcurand", + build_template = "//third_party/gpus/cuda:cuda_curand.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusolver", + dist_dict = cuda_distributives, + dist_name = "libcusolver", + build_template = "//third_party/gpus/cuda:cuda_cusolver.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = 
"cuda_cusparse", + dist_dict = cuda_distributives, + dist_name = "libcusparse", + build_template = "//third_party/gpus/cuda:cuda_cusparse.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvcc", + dist_dict = cuda_distributives, + dist_name = "cuda_nvcc", + build_file = "//third_party/gpus/cuda:cuda_nvcc.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvjitlink", + dist_dict = cuda_distributives, + dist_name = "libnvjitlink", + build_template = "//third_party/gpus/cuda:cuda_nvjitlink.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvml", + dist_dict = cuda_distributives, + dist_name = "cuda_nvml_dev", + build_file = "//third_party/gpus/cuda:cuda_nvml.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvprune", + dist_dict = cuda_distributives, + dist_name = "cuda_nvprune", + build_file = "//third_party/gpus/cuda:cuda_nvprune.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvtx", + dist_dict = cuda_distributives, + dist_name = "cuda_nvtx", + build_file = "//third_party/gpus/cuda:cuda_nvtx.BUILD", + ) diff --git a/third_party/gpus/compiler_common_tools.bzl b/third_party/gpus/compiler_common_tools.bzl new file mode 100644 index 00000000000000..bd07f49ec457bb --- /dev/null +++ b/third_party/gpus/compiler_common_tools.bzl @@ -0,0 +1,174 @@ +"""Common compiler functions. """ + +load( + "//third_party/remote_config:common.bzl", + "err_out", + "raw_exec", + "realpath", +) + +def to_list_of_strings(elements): + """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. + + This is to be used to put a list of strings into the bzl file templates + so it gets interpreted as list of strings in Starlark. + + Args: + elements: list of string elements + + Returns: + single string of elements wrapped in quotes separated by a comma.""" + quoted_strings = ["\"" + element + "\"" for element in elements] + return ", ".join(quoted_strings) + +_INC_DIR_MARKER_BEGIN = "#include <...>" + +# OSX add " (framework directory)" at the end of line, strip it. +_OSX_FRAMEWORK_SUFFIX = " (framework directory)" +_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) + +# TODO(dzc): Once these functions have been factored out of Bazel's +# cc_configure.bzl, load them from @bazel_tools instead. +def _cxx_inc_convert(path): + """Convert path returned by cc -E xc++ in a complete path.""" + path = path.strip() + if path.endswith(_OSX_FRAMEWORK_SUFFIX): + path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() + return path + +def _normalize_include_path(repository_ctx, path): + """Normalizes include paths before writing them to the crosstool. + + If path points inside the 'crosstool' folder of the repository, a relative + path is returned. + If path points outside the 'crosstool' folder, an absolute path is returned. + """ + path = str(repository_ctx.path(path)) + crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) + + if path.startswith(crosstool_folder): + # We drop the path to "$REPO/crosstool" and a trailing path separator. + return path[len(crosstool_folder) + 1:] + return path + +def _is_compiler_option_supported(repository_ctx, cc, option): + """Checks that `option` is supported by the C compiler. 
Doesn't %-escape the option.""" + result = repository_ctx.execute([ + cc, + option, + "-o", + "/dev/null", + "-c", + str(repository_ctx.path("tools/cpp/empty.cc")), + ]) + return result.stderr.find(option) == -1 + +def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sys_root): + """Compute the list of default C or C++ include directories.""" + if lang_is_cpp: + lang = "c++" + else: + lang = "c" + sysroot = [] + if tf_sys_root: + sysroot += ["--sysroot", tf_sys_root] + result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + + sysroot) + stderr = err_out(result) + index1 = stderr.find(_INC_DIR_MARKER_BEGIN) + if index1 == -1: + return [] + index1 = stderr.find("\n", index1) + if index1 == -1: + return [] + index2 = stderr.rfind("\n ") + if index2 == -1 or index2 < index1: + return [] + index2 = stderr.find("\n", index2 + 1) + if index2 == -1: + inc_dirs = stderr[index1 + 1:] + else: + inc_dirs = stderr[index1 + 1:index2].strip() + + print_resource_dir_supported = _is_compiler_option_supported( + repository_ctx, + cc, + "-print-resource-dir", + ) + + if print_resource_dir_supported: + resource_dir = repository_ctx.execute( + [cc, "-print-resource-dir"], + ).stdout.strip() + "/share" + inc_dirs += "\n" + resource_dir + + compiler_includes = [ + _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) + for p in inc_dirs.split("\n") + ] + + # The compiler might be on a symlink, e.g. /symlink -> /opt/gcc + # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) + # but Bazel might encounter either (usually reported by the compiler) + # especially when a compiler wrapper (e.g. ccache) is used. + # So we need to also include paths where symlinks are not resolved. + + # Try to find real path to CC installation to "see through" compiler wrappers + # GCC has the path to g++ + index1 = result.stderr.find("COLLECT_GCC=") + if index1 != -1: + index1 = result.stderr.find("=", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname + else: + # Clang has the directory + index1 = result.stderr.find("InstalledDir: ") + if index1 != -1: + index1 = result.stderr.find(" ", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname + else: + # Fallback to the CC path + cc_topdir = repository_ctx.path(cc).dirname.dirname + + # We now have the compiler installation prefix, e.g. /symlink/gcc + # And the resolved installation prefix, e.g. /opt/gcc + cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() + cc_topdir = str(cc_topdir).strip() + + # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. + # e.g. [/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] + # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] + if cc_topdir_resolved != cc_topdir: + unresolved_compiler_includes = [ + cc_topdir + inc[len(cc_topdir_resolved):] + for inc in compiler_includes + if inc.startswith(cc_topdir_resolved) + ] + compiler_includes = compiler_includes + unresolved_compiler_includes + return compiler_includes + +def get_cxx_inc_directories(repository_ctx, cc, tf_sys_root): + """Compute the list of default C and C++ include directories.""" + + # For some reason `clang -xc` sometimes returns include paths that are + # different from the ones from `clang -xc++`. 
(Symlink and a dir) + # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists + includes_cpp = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + True, + tf_sys_root, + ) + includes_c = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + False, + tf_sys_root, + ) + + return includes_cpp + [ + inc + for inc in includes_c + if inc not in includes_cpp + ] diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl index 8eda7a1cf6ac2b..b9553d9b99ecfe 100644 --- a/third_party/gpus/crosstool/BUILD.tpl +++ b/third_party/gpus/crosstool/BUILD.tpl @@ -2,6 +2,7 @@ # Update cuda_configure.bzl#verify_build_defines when adding new variables. load(":cc_toolchain_config.bzl", "cc_toolchain_config") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") licenses(["restricted"]) @@ -133,9 +134,17 @@ filegroup( srcs = [], ) +filegroup( + name = "cuda_nvcc_files", + srcs = %{cuda_nvcc_files}, +) + filegroup( name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], + srcs = [ + ":cuda_nvcc_files", + ":clang/bin/crosstool_wrapper_driver_is_not_gcc" + ], ) filegroup( diff --git a/third_party/gpus/cuda/BUILD.hermetic.tpl b/third_party/gpus/cuda/BUILD.hermetic.tpl new file mode 100644 index 00000000000000..1c00f1c5e32916 --- /dev/null +++ b/third_party/gpus/cuda/BUILD.hermetic.tpl @@ -0,0 +1,291 @@ +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +# Config setting whether TensorFlow is built with CUDA support using clang. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_clang. +selects.config_setting_group( + name = "using_clang", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_clang", + ], +) + +# Config setting whether TensorFlow is built with CUDA support using nvcc. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. +selects.config_setting_group( + name = "using_nvcc", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_nvcc", + ], +) + +# Equivalent to using_clang && -c opt. +selects.config_setting_group( + name = "using_clang_opt", + match_all = [ + ":using_clang", + ":_opt", + ], +) + +config_setting( + name = "_opt", + values = {"compilation_mode": "opt"}, +) + +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. 
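The %{cuda_nvcc_files} placeholder above is substituted by the configure rule, and folding it into `crosstool_wrapper_driver_is_not_gcc` puts the hermetic compiler binaries into the wrapper's runfiles. A plausible hermetic expansion, assuming the same @cuda_nvcc targets used elsewhere in this patch (the exact label list is not shown in this diff):

    filegroup(
        name = "cuda_nvcc_files",
        # Assumed expansion when hermetic CUDA is enabled; [] otherwise.
        srcs = [
            "@cuda_nvcc//:ptxas",
            "@cuda_nvcc//:nvvm",
        ],
    )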
diff --git a/third_party/gpus/cuda/BUILD.hermetic.tpl b/third_party/gpus/cuda/BUILD.hermetic.tpl
new file mode 100644
index 00000000000000..1c00f1c5e32916
--- /dev/null
+++ b/third_party/gpus/cuda/BUILD.hermetic.tpl
@@ -0,0 +1,291 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("@bazel_skylib//lib:selects.bzl", "selects")
+load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
+
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+# Config setting whether TensorFlow is built with CUDA support using clang.
+#
+# TODO(b/174244321), DEPRECATED: this target will be removed when all users
+# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_clang.
+selects.config_setting_group(
+    name = "using_clang",
+    match_all = [
+        "@local_config_cuda//:is_cuda_enabled",
+        "@local_config_cuda//:is_cuda_compiler_clang",
+    ],
+)
+
+# Config setting whether TensorFlow is built with CUDA support using nvcc.
+#
+# TODO(b/174244321), DEPRECATED: this target will be removed when all users
+# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc.
+selects.config_setting_group(
+    name = "using_nvcc",
+    match_all = [
+        "@local_config_cuda//:is_cuda_enabled",
+        "@local_config_cuda//:is_cuda_compiler_nvcc",
+    ],
+)
+
+# Equivalent to using_clang && -c opt.
+selects.config_setting_group(
+    name = "using_clang_opt",
+    match_all = [
+        ":using_clang",
+        ":_opt",
+    ],
+)
+
+config_setting(
+    name = "_opt",
+    values = {"compilation_mode": "opt"},
+)
+
+# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"'
+# All clients including TensorFlow should use these directives.
+cc_library(
+    name = "cuda_headers",
+    hdrs = [
+        "cuda/cuda_config.h",
+    ],
+    include_prefix = "third_party/gpus",
+    includes = [
+        ".",  # required to include cuda/cuda/cuda_config.h as cuda/config.h
+    ],
+    deps = [":cudart_headers",
+            ":cublas_headers",
+            ":cccl_headers",
+            ":nvtx_headers",
+            ":nvcc_headers",
+            ":nvjitlink_headers",
+            ":cusolver_headers",
+            ":cufft_headers",
+            ":cusparse_headers",
+            ":curand_headers",
+            ":cupti_headers",
+            ":nvml_headers"],
+)
+
+cc_library(
+    name = "cudart_static",
+    srcs = ["@cuda_cudart//:static"],
+    linkopts = [
+        "-ldl",
+        "-lpthread",
+        %{cudart_static_linkopt}
+    ],
+)
+
+alias(
+    name = "cuda_driver",
+    actual = "@cuda_cudart//:cuda_driver",
+)
+
+alias(
+    name = "cudart_headers",
+    actual = "@cuda_cudart//:headers",
+)
+
+alias(
+    name = "cudart",
+    actual = "@cuda_cudart//:cudart",
+)
+
+alias(
+    name = "nvjitlink_headers",
+    actual = "@cuda_nvjitlink//:headers",
+)
+
+alias(
+    name = "nvjitlink",
+    actual = "@cuda_nvjitlink//:nvjitlink",
+)
+
+alias(
+    name = "nvtx_headers",
+    actual = "@cuda_nvtx//:headers",
+)
+
+alias(
+    name = "nvml_headers",
+    actual = "@cuda_nvml//:headers",
+)
+
+alias(
+    name = "nvcc_headers",
+    actual = "@cuda_nvcc//:headers",
+)
+
+alias(
+    name = "cccl_headers",
+    actual = "@cuda_cccl//:headers",
+)
+
+alias(
+    name = "cublas_headers",
+    actual = "@cuda_cublas//:headers",
+)
+
+alias(
+    name = "cusolver_headers",
+    actual = "@cuda_cusolver//:headers",
+)
+
+alias(
+    name = "cufft_headers",
+    actual = "@cuda_cufft//:headers",
+)
+
+alias(
+    name = "cusparse_headers",
+    actual = "@cuda_cusparse//:headers",
+)
+
+alias(
+    name = "curand_headers",
+    actual = "@cuda_curand//:headers",
+)
+
+alias(
+    name = "cublas",
+    actual = "@cuda_cublas//:cublas",
+)
+
+alias(
+    name = "cublasLt",
+    actual = "@cuda_cublas//:cublasLt",
+)
+
+alias(
+    name = "cusolver",
+    actual = "@cuda_cusolver//:cusolver",
+)
+
+alias(
+    name = "cudnn",
+    actual = "@cuda_cudnn//:cudnn",
+)
+
+alias(
+    name = "cudnn_ops_infer",
+    actual = "@cuda_cudnn//:cudnn_ops_infer",
+)
+
+alias(
+    name = "cudnn_cnn_infer",
+    actual = "@cuda_cudnn//:cudnn_cnn_infer",
+)
+
+alias(
+    name = "cudnn_ops_train",
+    actual = "@cuda_cudnn//:cudnn_ops_train",
+)
+
+alias(
+    name = "cudnn_cnn_train",
+    actual = "@cuda_cudnn//:cudnn_cnn_train",
+)
+
+alias(
+    name = "cudnn_adv_infer",
+    actual = "@cuda_cudnn//:cudnn_adv_infer",
+)
+
+alias(
+    name = "cudnn_adv_train",
+    actual = "@cuda_cudnn//:cudnn_adv_train",
+)
+alias(
+    name = "cudnn_header",
+    actual = "@cuda_cudnn//:headers",
+)
+
+alias(
+    name = "cufft",
+    actual = "@cuda_cufft//:cufft",
+)
+
+alias(
+    name = "curand",
+    actual = "@cuda_curand//:curand",
+)
+
+cc_library(
+    name = "cuda",
+    deps = [
+        ":cublas",
+        ":cublasLt",
+        ":cuda_headers",
+        ":cudart",
+        ":cudnn",
+        ":cufft",
+        ":curand",
+    ],
+)
+
+alias(
+    name = "cub_headers",
+    actual = "%{cub_actual}",
+)
+
+alias(
+    name = "cupti_headers",
+    actual = "@cuda_cupti//:headers",
+)
+
+alias(
+    name = "cupti_dsos",
+    actual = "@cuda_cupti//:cupti",
+)
+
+alias(
+    name = "cusparse",
+    actual = "@cuda_cusparse//:cusparse",
+)
+
+cc_library(
+    name = "libdevice_root",
+    data = ["@cuda_nvcc//:nvvm"],
+)
+
+bzl_library(
+    name = "build_defs_bzl",
+    srcs = ["build_defs.bzl"],
+    deps = [
+        "@bazel_skylib//lib:selects",
+    ],
+)
+
+py_library(
+    name = "cuda_config_py",
+    srcs = ["cuda/cuda_config.py"],
+)
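The settings that follow are what `--config=no_cuda_libs` flips from the command line (note the default is True in this hermetic template and False in the non-hermetic BUILD.tpl below). A sketch of gating runtime dependencies on them in a downstream BUILD file (target name hypothetical):

    cc_library(
        name = "gpu_runtime_deps",
        deps = select({
            # Bundle NVIDIA shared libraries only when both hermetic tools
            # and hermetic libs are enabled.
            "@local_config_cuda//cuda:hermetic_cuda_tools_and_libs": [
                "@cuda_cudart//:cudart",
                "@cuda_cublas//:cublas",
            ],
            "//conditions:default": [],
        }),
    )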
+alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = True, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 90a18b90de048c..a4264cc14890e5 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -1,6 +1,7 @@ load(":build_defs.bzl", "cuda_header_library") load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like @@ -144,7 +145,6 @@ cc_library( name = "cusolver", srcs = ["cuda/lib/%{cusolver_lib}"], data = ["cuda/lib/%{cusolver_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -220,7 +220,6 @@ cc_library( name = "cusparse", srcs = ["cuda/lib/%{cusparse_lib}"], data = ["cuda/lib/%{cusparse_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -242,4 +241,29 @@ py_library( srcs = ["cuda/cuda_config.py"], ) +# Config setting whether TensorFlow is built with hermetic CUDA. +alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = False, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + %{copy_rules} diff --git a/third_party/gpus/cuda/cuda_cccl.BUILD b/third_party/gpus/cuda/cuda_cccl.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cccl.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cublas.BUILD.tpl b/third_party/gpus/cuda/cuda_cublas.BUILD.tpl new file mode 100644 index 00000000000000..d5766c971a50ff --- /dev/null +++ b/third_party/gpus/cuda/cuda_cublas.BUILD.tpl @@ -0,0 +1,33 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cublas", + hdrs = [":headers"], + shared_library = "lib/libcublas.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cublasLt", + hdrs = [":headers"], + shared_library = "lib/libcublasLt.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cublas.h", + "include/cublas_v2.h", + "include/cublas_api.h", + "include/cublasLt.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cudart.BUILD.tpl 
b/third_party/gpus/cuda/cuda_cudart.BUILD.tpl new file mode 100644 index 00000000000000..08655e7819156c --- /dev/null +++ b/third_party/gpus/cuda/cuda_cudart.BUILD.tpl @@ -0,0 +1,34 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +filegroup( + name = "static", + srcs = ["lib/libcudart_static.a"], + visibility = ["@local_config_cuda//cuda:__pkg__"], +) + +cc_import( + name = "cuda_driver", + shared_library = "lib/stubs/libcuda.so", +) + +cc_import( + name = "cudart", + hdrs = [":headers"], + shared_library = "lib/libcudart.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl b/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl new file mode 100644 index 00000000000000..98da6e69cbe644 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl @@ -0,0 +1,65 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cudnn", + hdrs = [":headers"], + shared_library = "lib/libcudnn.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cudnn", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cufft.BUILD.tpl b/third_party/gpus/cuda/cuda_cufft.BUILD.tpl new file mode 100644 index 00000000000000..6836814dc9b622 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cufft.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cufft", + hdrs = [":headers"], + shared_library = "lib/libcufft.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cupti.BUILD.tpl b/third_party/gpus/cuda/cuda_cupti.BUILD.tpl new file mode 100644 index 00000000000000..772386d723649f --- /dev/null +++ b/third_party/gpus/cuda/cuda_cupti.BUILD.tpl @@ -0,0 
+1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cupti", + hdrs = [":headers"], + shared_library = "lib/libcupti.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/extras/CUPTI/include", + includes = ["include/"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_curand.BUILD.tpl b/third_party/gpus/cuda/cuda_curand.BUILD.tpl new file mode 100644 index 00000000000000..c98ded26f4b907 --- /dev/null +++ b/third_party/gpus/cuda/cuda_curand.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "curand", + hdrs = [":headers"], + shared_library = "lib/libcurand.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl b/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl new file mode 100644 index 00000000000000..6a5f9d9737cfe2 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl @@ -0,0 +1,25 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusolver", + hdrs = [":headers"], + shared_library = "lib/libcusolver.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cusolver_common.h", + "include/cusolverDn.h", + "include/cusolverSp.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl b/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl new file mode 100644 index 00000000000000..ad5c2b5f0c45c1 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusparse", + hdrs = [":headers"], + shared_library = "lib/libcusparse.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nccl.BUILD b/third_party/gpus/cuda/cuda_nccl.BUILD new file mode 100644 index 00000000000000..440b31c5cb616e --- /dev/null +++ b/third_party/gpus/cuda/cuda_nccl.BUILD @@ -0,0 +1,7 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_import( + name = "nccl", + shared_library = "lib/libnccl.so.2", + visibility = ["//visibility:public"], +) diff --git a/third_party/gpus/cuda/cuda_nvcc.BUILD b/third_party/gpus/cuda/cuda_nvcc.BUILD new file mode 100644 index 00000000000000..6cdaca5cc902a0 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvcc.BUILD @@ -0,0 +1,73 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "bin/nvcc", +]) + +filegroup( + name = "nvvm", + srcs = [ + 
"nvvm/libdevice/libdevice.10.bc", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "nvlink", + srcs = [ + "bin/nvlink", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "fatbinary", + srcs = [ + "bin/fatbinary", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin2c", + srcs = [ + "bin/bin2c", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "ptxas", + srcs = [ + "bin/ptxas", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin", + srcs = glob([ + "bin/**", + "nvvm/bin/**", + ]), + visibility = ["//visibility:public"], +) + +filegroup( + name = "link_stub", + srcs = [ + "bin/crt/link.stub", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl b/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl new file mode 100644 index 00000000000000..6729b7cd1df9c4 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "nvjitlink", + hdrs = [":headers"], + shared_library = "lib/libnvJitLink.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nvml.BUILD b/third_party/gpus/cuda/cuda_nvml.BUILD new file mode 100644 index 00000000000000..40b97e671cf7de --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvml.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/nvml/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nvprune.BUILD b/third_party/gpus/cuda/cuda_nvprune.BUILD new file mode 100644 index 00000000000000..986ef0c8f76166 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvprune.BUILD @@ -0,0 +1,9 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +filegroup( + name = "nvprune", + srcs = [ + "bin/nvprune", + ], + visibility = ["//visibility:public"], +) diff --git a/third_party/gpus/cuda/cuda_nvtx.BUILD b/third_party/gpus/cuda/cuda_nvtx.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvtx.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index fefbf081c87e1c..b8aad7ed4994ee 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -53,6 +53,11 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "get_cxx_inc_directories", + "to_list_of_strings", +) 
_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX" @@ -67,20 +72,6 @@ _TF_CUDA_CONFIG_REPO = "TF_CUDA_CONFIG_REPO" _TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" -def to_list_of_strings(elements): - """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. - - This is to be used to put a list of strings into the bzl file templates - so it gets interpreted as list of strings in Starlark. - - Args: - elements: list of string elements - - Returns: - single string of elements wrapped in quotes separated by a comma.""" - quoted_strings = ["\"" + element + "\"" for element in elements] - return ", ".join(quoted_strings) - def verify_build_defines(params): """Verify all variables that crosstool/BUILD.tpl expects are substituted. @@ -238,156 +229,6 @@ def find_cc(repository_ctx, use_cuda_clang): " environment variable").format(target_cc_name, cc_path_envvar)) return cc -_INC_DIR_MARKER_BEGIN = "#include <...>" - -# OSX add " (framework directory)" at the end of line, strip it. -_OSX_FRAMEWORK_SUFFIX = " (framework directory)" -_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) - -def _cxx_inc_convert(path): - """Convert path returned by cc -E xc++ in a complete path.""" - path = path.strip() - if path.endswith(_OSX_FRAMEWORK_SUFFIX): - path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() - return path - -def _normalize_include_path(repository_ctx, path): - """Normalizes include paths before writing them to the crosstool. - - If path points inside the 'crosstool' folder of the repository, a relative - path is returned. - If path points outside the 'crosstool' folder, an absolute path is returned. - """ - path = str(repository_ctx.path(path)) - crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) - - if path.startswith(crosstool_folder): - # We drop the path to "$REPO/crosstool" and a trailing path separator. - return path[len(crosstool_folder) + 1:] - return path - -def _is_compiler_option_supported(repository_ctx, cc, option): - """Checks that `option` is supported by the C compiler. Doesn't %-escape the option.""" - result = repository_ctx.execute([ - cc, - option, - "-o", - "/dev/null", - "-c", - str(repository_ctx.path("tools/cpp/empty.cc")), - ]) - return result.stderr.find(option) == -1 - -def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sysroot): - """Compute the list of default C or C++ include directories.""" - if lang_is_cpp: - lang = "c++" - else: - lang = "c" - sysroot = [] - if tf_sysroot: - sysroot += ["--sysroot", tf_sysroot] - result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + - sysroot) - stderr = err_out(result) - index1 = stderr.find(_INC_DIR_MARKER_BEGIN) - if index1 == -1: - return [] - index1 = stderr.find("\n", index1) - if index1 == -1: - return [] - index2 = stderr.rfind("\n ") - if index2 == -1 or index2 < index1: - return [] - index2 = stderr.find("\n", index2 + 1) - if index2 == -1: - inc_dirs = stderr[index1 + 1:] - else: - inc_dirs = stderr[index1 + 1:index2].strip() - - print_resource_dir_supported = _is_compiler_option_supported( - repository_ctx, - cc, - "-print-resource-dir", - ) - - if print_resource_dir_supported: - resource_dir = repository_ctx.execute( - [cc, "-print-resource-dir"], - ).stdout.strip() + "/share" - inc_dirs += "\n" + resource_dir - - compiler_includes = [ - _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) - for p in inc_dirs.split("\n") - ] - - # The compiler might be on a symlink, e.g. 
/symlink -> /opt/gcc - # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) - # but Bazel might encounter either (usually reported by the compiler) - # especially when a compiler wrapper (e.g. ccache) is used. - # So we need to also include paths where symlinks are not resolved. - - # Try to find real path to CC installation to "see through" compiler wrappers - # GCC has the path to g++ - index1 = result.stderr.find("COLLECT_GCC=") - if index1 != -1: - index1 = result.stderr.find("=", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname - else: - # Clang has the directory - index1 = result.stderr.find("InstalledDir: ") - if index1 != -1: - index1 = result.stderr.find(" ", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname - else: - # Fallback to the CC path - cc_topdir = repository_ctx.path(cc).dirname.dirname - - # We now have the compiler installation prefix, e.g. /symlink/gcc - # And the resolved installation prefix, e.g. /opt/gcc - cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() - cc_topdir = str(cc_topdir).strip() - - # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. - # e.g. [/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] - # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] - if cc_topdir_resolved != cc_topdir: - unresolved_compiler_includes = [ - cc_topdir + inc[len(cc_topdir_resolved):] - for inc in compiler_includes - if inc.startswith(cc_topdir_resolved) - ] - compiler_includes = compiler_includes + unresolved_compiler_includes - return compiler_includes - -def get_cxx_inc_directories(repository_ctx, cc, tf_sysroot): - """Compute the list of default C and C++ include directories.""" - - # For some reason `clang -xc` sometimes returns include paths that are - # different from the ones from `clang -xc++`. (Symlink and a dir) - # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists - includes_cpp = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - True, - tf_sysroot, - ) - includes_c = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - False, - tf_sysroot, - ) - - return includes_cpp + [ - inc - for inc in includes_c - if inc not in includes_cpp - ] - def auto_configure_fail(msg): """Output failure message when cuda configuration fails.""" red = "\033[0;31m" @@ -1293,6 +1134,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cuda_nvcc_files}"] = "[]" if is_cuda_clang and not is_nvcc_and_clang: cuda_defines["%{host_compiler_path}"] = str(cc) cuda_defines["%{host_compiler_warnings}"] = """ diff --git a/third_party/gpus/hermetic_cuda_configure.bzl b/third_party/gpus/hermetic_cuda_configure.bzl new file mode 100644 index 00000000000000..5d16aa6f76a1f4 --- /dev/null +++ b/third_party/gpus/hermetic_cuda_configure.bzl @@ -0,0 +1,570 @@ +"""Repository rule for hermetic CUDA autoconfiguration. + +`hermetic_cuda_configure` depends on the following environment variables: + + * `TF_NEED_CUDA`: Whether to enable building with CUDA. + * `TF_NVCC_CLANG`: Whether to use clang for C++ and NVCC for Cuda compilation. 
+  * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for
+    both host and device code compilation.
+  * `TF_SYSROOT`: The sysroot to use when compiling.
+  * `TF_CUDA_VERSION`: The version of the CUDA toolkit (mandatory).
+  * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is
+    `compute_35,compute_52`.
+  * `PYTHON_BIN_PATH`: The python binary path.
+"""

+load(
+    "//third_party/remote_config:common.bzl",
+    "get_cpu_value",
+    "get_host_environ",
+    "which",
+)
+load(
+    ":compiler_common_tools.bzl",
+    "get_cxx_inc_directories",
+    "to_list_of_strings",
+)
+
+def _find_cc(repository_ctx):
+    """Find the C++ compiler."""
+    cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
+    cc_name = "clang"
+
+    cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar)
+    if cc_name_from_env:
+        cc_name = cc_name_from_env
+    if cc_name.startswith("/"):
+        # Return the absolute path.
+        return cc_name
+    cc = which(repository_ctx, cc_name)
+    if cc == None:
+        fail(("Cannot find {}, either correct your path or set the {}" +
+              " environment variable").format(cc_name, cc_path_envvar))
+    return cc
+
+def _auto_configure_fail(msg):
+    """Output failure message when cuda configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
+
+def _lib_name(base_name, cpu_value, version = None, static = False):
+    """Constructs the platform-specific name of a library.
+
+    Args:
+      base_name: The name of the library, such as "cudart".
+      cpu_value: The name of the host operating system.
+      version: The version of the library.
+      static: True if the library is static, False if it is a shared object.
+
+    Returns:
+      The platform-specific name of the library.
+    """
+    version = "" if not version else "." + version
+    if cpu_value == "Linux":
+        if static:
+            return "lib%s.a" % base_name
+        return "lib%s.so%s" % (base_name, version)
+    elif cpu_value == "Windows":
+        return "%s.lib" % base_name
+    elif cpu_value == "Darwin":
+        if static:
+            return "lib%s.a" % base_name
+        return "lib%s%s.dylib" % (base_name, version)
+    else:
+        _auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+
+def _verify_build_defines(params):
+    """Verify all variables that crosstool/BUILD.tpl expects are substituted.
+
+    Args:
+      params: dict of variables that will be passed to the BUILD.tpl template.
+    """
+    missing = []
+    for param in [
+        "cxx_builtin_include_directories",
+        "extra_no_canonical_prefixes_flags",
+        "host_compiler_path",
+        "host_compiler_prefix",
+        "host_compiler_warnings",
+        "linker_bin_path",
+        "compiler_deps",
+        "msvc_cl_path",
+        "msvc_env_include",
+        "msvc_env_lib",
+        "msvc_env_path",
+        "msvc_env_tmp",
+        "msvc_lib_path",
+        "msvc_link_path",
+        "msvc_ml_path",
+        "unfiltered_compile_flags",
+        "win_compiler_deps",
+    ]:
+        if ("%{" + param + "}") not in params:
+            missing.append(param)
+
+    if missing:
+        _auto_configure_fail(
+            "BUILD.tpl template is missing these variables: " +
+            str(missing) +
+            ".\nWe only got: " +
+            str(params) +
+            ".",
+        )
+
+def get_cuda_version(repository_ctx):
+    """Returns the CUDA toolkit version from TF_CUDA_VERSION."""
+    return get_host_environ(repository_ctx, _TF_CUDA_VERSION)
+
+def enable_cuda(repository_ctx):
+    """Returns whether to build with CUDA support."""
+    return int(get_host_environ(repository_ctx, TF_NEED_CUDA, False))
+
+def _flag_enabled(repository_ctx, flag_name):
+    return get_host_environ(repository_ctx, flag_name) == "1"
+
+def _use_nvcc_and_clang(repository_ctx):
+    # Returns True if clang should compile C++ and NVCC should compile CUDA.
+    return _flag_enabled(repository_ctx, _TF_NVCC_CLANG)
+
+def _tf_sysroot(repository_ctx):
+    return get_host_environ(repository_ctx, _TF_SYSROOT, "")
+
+def _py_tmpl_dict(d):
+    return {"%{cuda_config}": str(d)}
+
+def _cudart_static_linkopt(cpu_value):
+    """Returns the additional platform-specific linkopt for cudart_static."""
+    return "\"\"," if cpu_value == "Darwin" else "\"-lrt\","
+
+def _compute_capabilities(repository_ctx):
+    """Returns a list of strings representing CUDA compute capabilities.
+
+    Args:
+      repository_ctx: the repo rule's context.
+
+    Returns:
+      list of CUDA architectures to compile for. 'compute_xy' refers to
+      both PTX and SASS, 'sm_xy' refers to SASS only.
+    """
+    capabilities = get_host_environ(
+        repository_ctx,
+        _TF_CUDA_COMPUTE_CAPABILITIES,
+        "compute_35,compute_52",
+    ).split(",")
+
+    # Map old 'x.y' capabilities to 'compute_xy'.
+    if len(capabilities) > 0 and all([len(x.split(".")) == 2 for x in capabilities]):
+        # If all capabilities are in 'x.y' format, only include PTX for the
+        # highest capability.
+        cc_list = sorted([x.replace(".", "") for x in capabilities])
+        capabilities = ["sm_%s" % x for x in cc_list[:-1]] + ["compute_%s" % cc_list[-1]]
+    for i, capability in enumerate(capabilities):
+        parts = capability.split(".")
+        if len(parts) != 2:
+            continue
+        capabilities[i] = "compute_%s%s" % (parts[0], parts[1])
+
+    # Make the list unique.
+    capabilities = dict(zip(capabilities, capabilities)).keys()
+
+    # Validate capabilities.
+    for capability in capabilities:
+        if not capability.startswith(("compute_", "sm_")):
+            _auto_configure_fail("Invalid compute capability: %s" % capability)
+        for prefix in ["compute_", "sm_"]:
+            if not capability.startswith(prefix):
+                continue
+            if len(capability) == len(prefix) + 2 and capability[-2:].isdigit():
+                continue
+            if len(capability) == len(prefix) + 3 and capability.endswith("90a"):
+                continue
+            _auto_configure_fail("Invalid compute capability: %s" % capability)
+
+    return capabilities
+
+def _compute_cuda_extra_copts(compute_capabilities):
+    copts = ["--no-cuda-include-ptx=all"]
+    for capability in compute_capabilities:
+        if capability.startswith("compute_"):
+            capability = capability.replace("compute_", "sm_")
+            copts.append("--cuda-include-ptx=%s" % capability)
+        copts.append("--cuda-gpu-arch=%s" % capability)
+
+    return str(copts)
+
+def _get_cuda_config(repository_ctx):
+    """Returns information about the hermetic CUDA configuration.
+
+    Unlike the non-hermetic rule, the library versions are read from the
+    version.txt files shipped with the downloaded redistributions rather
+    than detected on the local system.
+
+    Args:
+      repository_ctx: The repository context.
+
+    Returns:
+      A struct containing the following fields:
+        cuda_version: The CUDA toolkit version.
+        cupti_version: The CUPTI version.
+        cudart_version: The CUDA runtime version.
+        cublas_version: The cuBLAS version.
+        cusolver_version: The cuSOLVER version.
+        curand_version: The cuRAND version.
+        cufft_version: The cuFFT version.
+        cusparse_version: The cuSPARSE version.
+        cudnn_version: The cuDNN version.
+        compute_capabilities: A list of the CUDA compute capabilities.
+        cpu_value: The name of the host operating system.
+ """ + + return struct( + cuda_version = get_cuda_version(repository_ctx), + cupti_version = repository_ctx.read(repository_ctx.attr.cupti_version), + cudart_version = repository_ctx.read(repository_ctx.attr.cudart_version), + cublas_version = repository_ctx.read(repository_ctx.attr.cublas_version), + cusolver_version = repository_ctx.read(repository_ctx.attr.cusolver_version), + curand_version = repository_ctx.read(repository_ctx.attr.curand_version), + cufft_version = repository_ctx.read(repository_ctx.attr.cufft_version), + cusparse_version = repository_ctx.read(repository_ctx.attr.cusparse_version), + cudnn_version = repository_ctx.read(repository_ctx.attr.cudnn_version), + compute_capabilities = _compute_capabilities(repository_ctx), + cpu_value = get_cpu_value(repository_ctx), + ) + +_DUMMY_CROSSTOOL_BZL_FILE = """ +def error_gpu_disabled(): + fail("ERROR: Building with --config=cuda but TensorFlow is not configured " + + "to build with GPU support. Please re-run ./configure and enter 'Y' " + + "at the prompt to build with GPU support.") + + native.genrule( + name = "error_gen_crosstool", + outs = ["CROSSTOOL"], + cmd = "echo 'Should not be run.' && exit 1", + ) + + native.filegroup( + name = "crosstool", + srcs = [":CROSSTOOL"], + output_licenses = ["unencumbered"], + ) +""" + +_DUMMY_CROSSTOOL_BUILD_FILE = """ +load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") + +error_gpu_disabled() +""" + +def _create_dummy_repository(repository_ctx): + cpu_value = get_cpu_value(repository_ctx) + + # Set up BUILD file for cuda/. + repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "False", + "%{cuda_extra_copts}": "[]", + "%{cuda_gpu_architectures}": "[]", + "%{cuda_version}": "0.0", + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.dummy_cuda_build_tpl, + { + "%{cuda_driver_lib}": _lib_name("cuda", cpu_value), + "%{cudart_static_lib}": _lib_name( + "cudart_static", + cpu_value, + static = True, + ), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), + "%{cudart_lib}": _lib_name("cudart", cpu_value), + "%{cublas_lib}": _lib_name("cublas", cpu_value), + "%{cublasLt_lib}": _lib_name("cublasLt", cpu_value), + "%{cusolver_lib}": _lib_name("cusolver", cpu_value), + "%{cudnn_lib}": _lib_name("cudnn", cpu_value), + "%{cufft_lib}": _lib_name("cufft", cpu_value), + "%{curand_lib}": _lib_name("curand", cpu_value), + "%{cupti_lib}": _lib_name("cupti", cpu_value), + "%{cusparse_lib}": _lib_name("cusparse", cpu_value), + "%{cub_actual}": ":cuda_headers", + "%{copy_rules}": """ +filegroup(name="cuda-include") +filegroup(name="cublas-include") +filegroup(name="cusolver-include") +filegroup(name="cufft-include") +filegroup(name="cusparse-include") +filegroup(name="curand-include") +filegroup(name="cudnn-include") +""", + }, + ) + + # Create dummy files for the CUDA toolkit since they are still required by + # tensorflow/tsl/platform/default/build_config:cuda. 
+ repository_ctx.file("cuda/cuda/include/cuda.h") + repository_ctx.file("cuda/cuda/include/cublas.h") + repository_ctx.file("cuda/cuda/include/cudnn.h") + repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") + repository_ctx.file("cuda/cuda/nvml/include/nvml.h") + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value)) + repository_ctx.file( + "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value), + ) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublasLt", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusparse", cpu_value)) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. + repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": "", + "%{cudart_version}": "", + "%{cupti_version}": "", + "%{cublas_version}": "", + "%{cusolver_version}": "", + "%{curand_version}": "", + "%{cufft_version}": "", + "%{cusparse_version}": "", + "%{cudnn_version}": "", + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": "", + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({}), + ) + + # If cuda_configure is not configured to build with GPU support, and the user + # attempts to build with --config=cuda, add a dummy build rule to intercept + # this and fail with an actionable error message. + repository_ctx.file( + "crosstool/error_gpu_disabled.bzl", + _DUMMY_CROSSTOOL_BZL_FILE, + ) + repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE) + +def _create_local_cuda_repository(repository_ctx): + """Creates the repository containing files set up to build with CUDA.""" + cuda_config = _get_cuda_config(repository_ctx) + + # Set up BUILD file for cuda/ + repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "True", + "%{cuda_extra_copts}": _compute_cuda_extra_copts( + cuda_config.compute_capabilities, + ), + "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities), + "%{cuda_version}": cuda_config.cuda_version, + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.cuda_build_tpl, + { + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cub_actual}": ":cuda_headers", + }, + ) + + is_nvcc_and_clang = _use_nvcc_and_clang(repository_ctx) + tf_sysroot = _tf_sysroot(repository_ctx) + + # Set up crosstool/ + cc = _find_cc(repository_ctx) + host_compiler_includes = get_cxx_inc_directories( + repository_ctx, + cc, + tf_sysroot, + ) + + cuda_defines = {} + + # We do not support hermetic CUDA on Windows. + # This ensures the CROSSTOOL file parser is happy. 
+ cuda_defines.update({ + "%{msvc_env_tmp}": "msvc_not_used", + "%{msvc_env_path}": "msvc_not_used", + "%{msvc_env_include}": "msvc_not_used", + "%{msvc_env_lib}": "msvc_not_used", + "%{msvc_cl_path}": "msvc_not_used", + "%{msvc_ml_path}": "msvc_not_used", + "%{msvc_link_path}": "msvc_not_used", + "%{msvc_lib_path}": "msvc_not_used", + "%{win_compiler_deps}": ":empty", + }) + + cuda_defines["%{builtin_sysroot}"] = tf_sysroot + cuda_defines["%{cuda_toolkit_path}"] = repository_ctx.attr.nvcc_binary.workspace_root + cuda_defines["%{compiler}"] = "clang" + cuda_defines["%{host_compiler_prefix}"] = "/usr/bin" + cuda_defines["%{linker_bin_path}"] = "" + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" + cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings(host_compiler_includes) + cuda_defines["%{cuda_nvcc_files}"] = "if_cuda([\"@{nvcc_archive}//:bin\", \"@{nvcc_archive}//:nvvm\"])".format(nvcc_archive = repository_ctx.attr.nvcc_binary.repo_name) + + if not is_nvcc_and_clang: + cuda_defines["%{host_compiler_path}"] = str(cc) + cuda_defines["%{host_compiler_warnings}"] = """ + # Some parts of the codebase set -Werror and hit this warning, so + # switch it off for now. + "-Wno-invalid-partial-specialization" + """ + cuda_defines["%{compiler_deps}"] = ":cuda_nvcc_files" + repository_ctx.file( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + "", + ) + else: + cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" + cuda_defines["%{host_compiler_warnings}"] = "" + + nvcc_relative_path = "%s/%s" % (repository_ctx.attr.nvcc_binary.workspace_root, repository_ctx.attr.nvcc_binary.name) + cuda_defines["%{compiler_deps}"] = ":crosstool_wrapper_driver_is_not_gcc" + + wrapper_defines = { + "%{cpu_compiler}": str(cc), + "%{cuda_version}": cuda_config.cuda_version, + "%{nvcc_path}": nvcc_relative_path, + "%{host_compiler_path}": str(cc), + "%{use_clang_compiler}": "True", + } + repository_ctx.template( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + repository_ctx.attr.crosstool_wrapper_driver_is_not_gcc_tpl, + wrapper_defines, + ) + + _verify_build_defines(cuda_defines) + + # Only expand template variables in the BUILD file + repository_ctx.template( + "crosstool/BUILD", + repository_ctx.attr.crosstool_build_tpl, + cuda_defines, + ) + + # No templating of cc_toolchain_config - use attributes and templatize the + # BUILD file. + repository_ctx.template( + "crosstool/cc_toolchain_config.bzl", + repository_ctx.attr.cc_toolchain_config_tpl, + {}, + ) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. 
+ repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": cuda_config.cuda_version, + "%{cudart_version}": cuda_config.cudart_version, + "%{cupti_version}": cuda_config.cupti_version, + "%{cublas_version}": cuda_config.cublas_version, + "%{cusolver_version}": cuda_config.cusolver_version, + "%{curand_version}": cuda_config.curand_version, + "%{cufft_version}": cuda_config.cufft_version, + "%{cusparse_version}": cuda_config.cusparse_version, + "%{cudnn_version}": cuda_config.cudnn_version, + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": ", ".join([ + cc.split("_")[1] + for cc in cuda_config.compute_capabilities + ]), + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({ + "cuda_version": cuda_config.cuda_version, + "cudnn_version": cuda_config.cudnn_version, + "cuda_compute_capabilities": cuda_config.compute_capabilities, + "cpu_compiler": str(cc), + }), + ) + +def _cuda_autoconf_impl(repository_ctx): + """Implementation of the cuda_autoconf repository rule.""" + build_file = repository_ctx.attr.local_config_cuda_build_file + + if not enable_cuda(repository_ctx): + _create_dummy_repository(repository_ctx) + else: + _create_local_cuda_repository(repository_ctx) + + repository_ctx.symlink(build_file, "BUILD") + +_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" +_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" +_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" +_TF_CUDA_VERSION = "TF_CUDA_VERSION" +TF_NEED_CUDA = "TF_NEED_CUDA" +_TF_NVCC_CLANG = "TF_NVCC_CLANG" +_TF_SYSROOT = "TF_SYSROOT" + +_ENVIRONS = [ + _CLANG_CUDA_COMPILER_PATH, + TF_NEED_CUDA, + _TF_NVCC_CLANG, + _TF_CUDA_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + _TF_SYSROOT, + _PYTHON_BIN_PATH, + "TMP", + "TMPDIR", +] + +hermetic_cuda_configure = repository_rule( + implementation = _cuda_autoconf_impl, + environ = _ENVIRONS, + attrs = { + "environ": attr.string_dict(), + "cublas_version": attr.label(default = Label("@cuda_cublas//:version.txt")), + "cudart_version": attr.label(default = Label("@cuda_cudart//:version.txt")), + "cudnn_version": attr.label(default = Label("@cuda_cudnn//:version.txt")), + "cufft_version": attr.label(default = Label("@cuda_cufft//:version.txt")), + "cupti_version": attr.label(default = Label("@cuda_cupti//:version.txt")), + "curand_version": attr.label(default = Label("@cuda_curand//:version.txt")), + "cusolver_version": attr.label(default = Label("@cuda_cusolver//:version.txt")), + "cusparse_version": attr.label(default = Label("@cuda_cusparse//:version.txt")), + "nvcc_binary": attr.label(default = Label("@cuda_nvcc//:bin/nvcc")), + "local_config_cuda_build_file": attr.label(default = Label("//third_party/gpus:local_config_cuda.BUILD")), + "build_defs_tpl": attr.label(default = Label("//third_party/gpus/cuda:build_defs.bzl.tpl")), + "cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.hermetic.tpl")), + "dummy_cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.tpl")), + "cuda_config_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.h.tpl")), + "cuda_config_py_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.py.tpl")), + "crosstool_wrapper_driver_is_not_gcc_tpl": attr.label(default = 
Label("//third_party/gpus/crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl")), + "crosstool_build_tpl": attr.label(default = Label("//third_party/gpus/crosstool:BUILD.tpl")), + "cc_toolchain_config_tpl": attr.label(default = Label("//third_party/gpus/crosstool:cc_toolchain_config.bzl.tpl")), + }, +) +"""Detects and configures the hermetic CUDA toolchain. + +Add the following to your WORKSPACE FILE: + +```python +hermetic cuda_configure(name = "local_config_cuda") +``` + +Args: + name: A unique name for this workspace rule. +""" diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 0fd4019fc5bb75..cf756b452e3950 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -22,12 +22,15 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "enable_cuda", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) load( ":sycl_configure.bzl", diff --git a/third_party/gpus/sycl_configure.bzl b/third_party/gpus/sycl_configure.bzl index 05330b2fe53195..dd80694e7274f5 100644 --- a/third_party/gpus/sycl_configure.bzl +++ b/third_party/gpus/sycl_configure.bzl @@ -16,11 +16,14 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl index 53a6d4e1e41890..a0930df34ecec8 100644 --- a/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/nccl/build_defs.bzl.tpl @@ -5,7 +5,6 @@ load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") # CUDA toolkit version as tuple (e.g. '(11, 1)'). _cuda_version = %{cuda_version} -_cuda_clang = %{cuda_clang} def _rdc_copts(): """Returns copts for compiling relocatable device code.""" @@ -121,25 +120,25 @@ _device_link = rule( "gpu_archs": attr.string_list(), "nvlink_args": attr.string_list(), "_nvlink": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"), + default = Label("%{nvlink_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_fatbinary": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"), + default = Label("%{fatbinary_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_bin2c": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"), + default = Label("%{bin2c_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_link_stub": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"), + default = Label("%{link_stub_label}"), allow_single_file = True, ), }, @@ -189,7 +188,7 @@ _prune_relocatable_code = rule( "input": attr.label(mandatory = True, allow_files = True), "gpu_archs": attr.string_list(), "_nvprune": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"), + default = Label("%{nvprune_label}"), allow_single_file = True, executable = True, cfg = "host", diff --git a/third_party/nccl/hermetic_nccl_configure.bzl b/third_party/nccl/hermetic_nccl_configure.bzl new file mode 100644 index 00000000000000..b99cbcb08db58a --- /dev/null +++ b/third_party/nccl/hermetic_nccl_configure.bzl @@ -0,0 +1,153 @@ +"""Repository rule for hermetic NCCL configuration. 
+ +`hermetic_nccl_configure` depends on the following environment variables: + + * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should + be used, "0" if NCCL should be linked in statically. + +""" + +load( + "//third_party/gpus:hermetic_cuda_configure.bzl", + "TF_NEED_CUDA", + "enable_cuda", + "get_cuda_version", +) +load( + "//third_party/remote_config:common.bzl", + "get_cpu_value", + "get_host_environ", +) + +_TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" + +_NCCL_DUMMY_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl_config", + hdrs = ["nccl_config.h"], + include_prefix = "third_party/nccl", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_STUB_BUILD_CONTENT = """ +alias( + name = "nccl_lib", + actual = "@cuda_nccl//:nccl_lib", +) + +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl_via_stub", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_headers", + actual = "@nccl_archive//:nccl_headers", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +def _create_local_nccl_repository(repository_ctx): + cuda_version = get_cuda_version(repository_ctx) + if cuda_version == "12": + cuda_version = "12.3" + cuda_version = cuda_version.split(".") + + # Alias to open source build from @nccl_archive. + if get_host_environ(repository_ctx, _TF_NCCL_USE_STUB, "0") == "0": + repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT) + else: + repository_ctx.file("BUILD", _NCCL_ARCHIVE_STUB_BUILD_CONTENT) + + repository_ctx.template("generated_names.bzl", repository_ctx.attr.generated_names_tpl, {}) + repository_ctx.template( + "build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), + "%{nvlink_label}": "@cuda_nvcc//:nvlink", + "%{fatbinary_label}": "@cuda_nvcc//:fatbinary", + "%{bin2c_label}": "@cuda_nvcc//:bin2c", + "%{link_stub_label}": "@cuda_nvcc//:link_stub", + "%{nvprune_label}": "@cuda_nvprune//:nvprune", + }, + ) + +def _nccl_autoconf_impl(repository_ctx): + if (not enable_cuda(repository_ctx) or + get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")): + # Add a dummy build file to make bazel query happy. 
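+        # The dummy targets keep //third_party/nccl dependencies resolvable
+        # when CUDA is disabled or the platform is unsupported; nccl_config.h
+        # then reports an empty TF_NCCL_VERSION.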
+ repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT) + repository_ctx.file("nccl_config.h", "#define TF_NCCL_VERSION \"\"") + else: + _create_local_nccl_repository(repository_ctx) + +_ENVIRONS = [ + TF_NEED_CUDA, +] + +hermetic_nccl_configure = repository_rule( + environ = _ENVIRONS, + implementation = _nccl_autoconf_impl, + attrs = { + "environ": attr.string_dict(), + "generated_names_tpl": attr.label(default = Label("//third_party/nccl:generated_names.bzl.tpl")), + "build_defs_tpl": attr.label(default = Label("//third_party/nccl:build_defs.bzl.tpl")), + "system_build_tpl": attr.label(default = Label("//third_party/nccl:system.BUILD.tpl")), + }, +) +"""Downloads and configures the hermetic NCCL configuration. + +Add the following to your WORKSPACE FILE: + +```python +hermetic_nccl_configure(name = "local_config_nccl") +``` + +Args: + name: A unique name for this workspace rule. +""" diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index 22cf64d4771062..4da2513e03eb44 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -8,7 +8,6 @@ files. * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is `/usr/local/cuda,usr/`. - * `TF_CUDA_CLANG`: "1" if using Clang, "0" if using NVCC. * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should be used, "0" if NCCL should be linked in statically. @@ -33,7 +32,6 @@ _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" _TF_NCCL_VERSION = "TF_NCCL_VERSION" _TF_NEED_CUDA = "TF_NEED_CUDA" _TF_CUDA_PATHS = "TF_CUDA_PATHS" -_TF_CUDA_CLANG = "TF_CUDA_CLANG" _TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR" @@ -129,7 +127,11 @@ def _create_local_nccl_repository(repository_ctx): _label("build_defs.bzl.tpl"), { "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), - "%{cuda_clang}": repr(get_host_environ(repository_ctx, _TF_CUDA_CLANG)), + "%{nvlink_label}": "@local_config_cuda//cuda:cuda/bin/nvlink", + "%{fatbinary_label}": "@local_config_cuda//cuda:cuda/bin/fatbinary", + "%{bin2c_label}": "@local_config_cuda//cuda:cuda/bin/bin2c", + "%{link_stub_label}": "@local_config_cuda//cuda:cuda/bin/crt/link.stub", + "%{nvprune_label}": "@local_config_cuda//cuda:cuda/bin/nvprune", }, ) else: @@ -181,7 +183,6 @@ _ENVIRONS = [ _TF_CUDA_COMPUTE_CAPABILITIES, _TF_NEED_CUDA, _TF_CUDA_PATHS, - _TF_CUDA_CLANG, ] remote_nccl_configure = repository_rule( diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc index 02dec0349c4741..c17ae4494dc99c 100644 --- a/third_party/xla/.bazelrc +++ b/third_party/xla/.bazelrc @@ -226,12 +226,13 @@ build:cuda --repo_env TF_NEED_CUDA=1 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --@local_config_cuda//:enable_cuda +build:no_cuda_libs --@local_config_cuda//cuda:include_hermetic_cuda_libs=false + # CUDA: This config refers to building CUDA op kernels with clang. build:cuda_clang --config=cuda -# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt -build:cuda_clang --config=tensorrt build:cuda_clang --action_env=TF_CUDA_CLANG="1" build:cuda_clang --@local_config_cuda//:cuda_compiler=clang +build:cuda_clang --copt=-Qunused-arguments # Select supported compute capabilities (supported graphics cards). # This is the same as the official TensorFlow builds. 
# See https://developer.nvidia.com/cuda-gpus#compute @@ -244,12 +245,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. build:cuda_clang_official --config=cuda_clang -build:cuda_clang_official --action_env=TF_CUDA_VERSION="12" -build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8" -build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3" +build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3" +build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8.9" build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang" -build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" # Build with nvcc for CUDA and clang for host @@ -545,10 +544,6 @@ build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda" -build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt" -build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl" -test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --config=nvcc_clang @@ -633,7 +628,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla # Test-related settings below this point. test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true test:release_linux_base --local_test_jobs=HOST_CPUS -test:release_linux_base --test_env=LD_LIBRARY_PATH # Give only the list of failed tests at the end of the log test:release_linux_base --test_summary=short @@ -645,7 +639,6 @@ build:release_gpu_linux --config=release_cpu_linux # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. # Note that linux cpu and cuda builds share the same toolchain now. 
build:release_gpu_linux --config=cuda_clang_official -test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" # Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute @@ -674,11 +667,8 @@ test:unsupported_cpu_linux --config=release_base build:unsupported_gpu_linux --config=cuda build:unsupported_gpu_linux --config=unsupported_cpu_linux build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11" -build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8" +build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8.6" build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" -build:unsupported_gpu_linux --config=tensorrt -build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2" -build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib" build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain diff --git a/third_party/xla/build_tools/configure/configure.py b/third_party/xla/build_tools/configure/configure.py index 663e4b8724280d..74d571909a2c57 100755 --- a/third_party/xla/build_tools/configure/configure.py +++ b/third_party/xla/build_tools/configure/configure.py @@ -27,11 +27,6 @@ the clang in your path. If that isn't the correct clang, you can override like `./configure.py --backend=cpu --clang_path=`. -NOTE(ddunleavy): Lots of these things should probably be outside of configure.py -but are here because of complexity in `cuda_configure.bzl` and the TF bazelrc. -Once XLA has it's own bazelrc, and cuda_configure.bzl is replaced or refactored, -we can probably make this file smaller. - TODO(ddunleavy): add more thorough validation. """ import argparse @@ -45,18 +40,10 @@ import sys from typing import Optional -_REQUIRED_CUDA_LIBRARIES = ["cublas", "cuda", "cudnn"] +_REQUIRED_CUDA_LIBRARIES = ["cuda", "cudnn"] _DEFAULT_BUILD_AND_TEST_TAG_FILTERS = ("-no_oss",) # Assume we are being invoked from the symlink at the root of the repo _XLA_SRC_ROOT = pathlib.Path(__file__).absolute().parent -_FIND_CUDA_CONFIG = str( - _XLA_SRC_ROOT - / "third_party" - / "tsl" - / "third_party" - / "gpus" - / "find_cuda_config.py" -) _XLA_BAZELRC_NAME = "xla_configure.bazelrc" _KW_ONLY_IF_PYTHON310 = {"kw_only": True} if sys.version_info >= (3, 10) else {} @@ -218,11 +205,9 @@ class DiscoverablePathsAndVersions: ld_library_path: Optional[str] = None # CUDA specific - cublas_version: Optional[str] = None - cuda_toolkit_path: Optional[str] = None + cuda_version: Optional[str] = None cuda_compute_capabilities: Optional[list[str]] = None cudnn_version: Optional[str] = None - nccl_version: Optional[str] = None def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"): """Gets paths and versions as needed by the config. @@ -241,7 +226,7 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"): ) # Notably, we don't use `_find_executable_or_die` for lld, as it changes - # which commands it accepts based on it's name! ld.lld is symlinked to a + # which commands it accepts based on its name! 
ld.lld is symlinked to a # different executable just called lld, which should not be invoked # directly. self.lld_path = self.lld_path or shutil.which("ld.lld") @@ -255,64 +240,6 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"): if not self.cuda_compute_capabilities: self.cuda_compute_capabilities = _get_cuda_compute_capabilities_or_die() - self._get_cuda_libraries_paths_and_versions_if_needed(config) - - def _get_cuda_libraries_paths_and_versions_if_needed( - self, config: "XLAConfigOptions" - ): - """Gets cuda paths and versions if user left any unspecified. - - This uses `find_cuda_config.py` to find versions for all libraries in - `_REQUIRED_CUDA_LIBRARIES`. - - Args: - config: config that determines which libraries should be found. - """ - should_find_nccl = config.using_nccl and self.nccl_version is None - any_cuda_config_unset = any([ - self.cublas_version is None, - self.cuda_toolkit_path is None, - self.cudnn_version is None, - should_find_nccl, - ]) - - maybe_nccl = ["nccl"] if should_find_nccl else [] - - if any_cuda_config_unset: - logging.info( - "Some CUDA config versions and paths were not provided, " - "so trying to find them using find_cuda_config.py" - ) - try: - find_cuda_config_proc = subprocess.run( - [ - sys.executable, - _FIND_CUDA_CONFIG, - *_REQUIRED_CUDA_LIBRARIES, - *maybe_nccl, - ], - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - logging.info("Command %s failed. Is CUDA installed?", e.cmd) - logging.info("Dumping %s ouptut:\n %s", e.cmd, e.output) - raise e - - cuda_config = dict( - tuple(line.split(": ")) - for line in find_cuda_config_proc.stdout.strip().split("\n") - ) - - self.cublas_version = self.cublas_version or cuda_config["cublas_version"] - self.cuda_toolkit_path = ( - self.cuda_toolkit_path or cuda_config["cuda_toolkit_path"] - ) - self.cudnn_version = self.cudnn_version or cuda_config["cudnn_version"] - if should_find_nccl: - self.nccl_version = self.nccl_version or cuda_config["nccl_version"] - @dataclasses.dataclass(frozen=True, **_KW_ONLY_IF_PYTHON310) class XLAConfigOptions: @@ -327,7 +254,6 @@ class XLAConfigOptions: # CUDA specific cuda_compiler: CudaCompiler using_nccl: bool - using_tensorrt: bool def to_bazelrc_lines( self, @@ -386,19 +312,13 @@ def to_bazelrc_lines( ) # Lines needed for CUDA backend regardless of CUDA/host compiler - rc.append( - f"build --action_env CUDA_TOOLKIT_PATH={dpav.cuda_toolkit_path}" - ) - rc.append(f"build --action_env TF_CUBLAS_VERSION={dpav.cublas_version}") + rc.append(f"build --action_env TF_CUDA_VERSION={dpav.cuda_version}") rc.append( "build --action_env" f" TF_CUDA_COMPUTE_CAPABILITIES={','.join(dpav.cuda_compute_capabilities)}" ) rc.append(f"build --action_env TF_CUDNN_VERSION={dpav.cudnn_version}") - rc.append(f"build --repo_env TF_NEED_TENSORRT={int(self.using_tensorrt)}") - if self.using_nccl: - rc.append(f"build --action_env TF_NCCL_VERSION={dpav.nccl_version}") - else: + if not self.using_nccl: rc.append("build --config nonccl") elif self.backend == Backend.ROCM: pass @@ -468,7 +388,6 @@ def _parse_args(): default="-Wno-sign-compare", ) parser.add_argument("--nccl", action="store_true") - parser.add_argument("--tensorrt", action="store_true") # Path and version overrides path_help = "Optional: will be found on PATH if possible." 
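+# CUDA and cuDNN versions are now handed to Bazel via TF_CUDA_VERSION /
+# TF_CUDNN_VERSION, and Bazel downloads the matching hermetic
+# redistributions instead of probing a local install with
+# find_cuda_config.py.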
@@ -484,13 +403,16 @@ def _parse_args(): parser.add_argument("--lld_path", help=path_help) # CUDA specific - find_cuda_config_help = ( - "Optional: will be found using `find_cuda_config.py` if flag is not set." + parser.add_argument( + "--cuda_version", + help="Optional: CUDA will be downloaded by Bazel if the flag is provided", + ) + parser.add_argument( + "--cudnn_version", + help=( + "Optional: CUDNN will be downloaded by Bazel if the flag is provided" + ), ) - parser.add_argument("--cublas_version", help=find_cuda_config_help) - parser.add_argument("--cuda_toolkit_path", help=find_cuda_config_help) - parser.add_argument("--cudnn_version", help=find_cuda_config_help) - parser.add_argument("--nccl_version", help=find_cuda_config_help) return parser.parse_args() @@ -510,7 +432,6 @@ def main(): python_bin_path=args.python_bin_path, compiler_options=args.compiler_options, using_nccl=args.nccl, - using_tensorrt=args.tensorrt, ) bazelrc_lines = config.to_bazelrc_lines( @@ -519,11 +440,9 @@ def main(): gcc_path=args.gcc_path, lld_path=args.lld_path, ld_library_path=args.ld_library_path, - cublas_version=args.cublas_version, - cuda_compute_capabilities=args.cuda_compute_capabilities, - cuda_toolkit_path=args.cuda_toolkit_path, + cuda_version=args.cuda_version, cudnn_version=args.cudnn_version, - nccl_version=args.nccl_version, + cuda_compute_capabilities=args.cuda_compute_capabilities, ) ) diff --git a/third_party/xla/build_tools/configure/configure_test.py b/third_party/xla/build_tools/configure/configure_test.py index c952c8f9241f4f..8a1ca1ab3c699f 100644 --- a/third_party/xla/build_tools/configure/configure_test.py +++ b/third_party/xla/build_tools/configure/configure_test.py @@ -32,12 +32,10 @@ # CUDA specific paths and versions _CUDA_SPECIFIC_PATHS_AND_VERSIONS = { - "cublas_version": "12.3", - "cuda_toolkit_path": "/usr/local/cuda-12.2", + "cuda_version": "12.3", "cuda_compute_capabilities": ["7.5"], "cudnn_version": "8", "ld_library_path": "/usr/local/nvidia/lib:/usr/local/nvidia/lib64", - "nccl_version": "2", } @@ -75,7 +73,6 @@ def test_clang_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -97,7 +94,6 @@ def test_gcc_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -118,7 +114,6 @@ def test_cuda_clang_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.CLANG, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -140,7 +135,6 @@ def test_nvcc_clang_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -162,7 +156,6 @@ def test_nvcc_gcc_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( diff --git a/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc b/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc index b998cf06935f33..62c5224a98dd19 100644 --- a/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc @@ -3,11 +3,9 @@ build --repo_env CC=/usr/lib/llvm-17/bin/clang build --repo_env 
BAZEL_COMPILER=/usr/lib/llvm-17/bin/clang build --config cuda_clang build --action_env CLANG_CUDA_COMPILER_PATH=/usr/lib/llvm-17/bin/clang -build --action_env CUDA_TOOLKIT_PATH=/usr/local/cuda-12.2 -build --action_env TF_CUBLAS_VERSION=12.3 +build --action_env TF_CUDA_VERSION=12.3 build --action_env TF_CUDA_COMPUTE_CAPABILITIES=7.5 build --action_env TF_CUDNN_VERSION=8 -build --repo_env TF_NEED_TENSORRT=0 build --config nonccl build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 build --action_env PYTHON_BIN_PATH=/usr/bin/python3 diff --git a/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc b/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc index 912dc50faff4c1..ab3d220cdea804 100644 --- a/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc @@ -3,11 +3,9 @@ build --repo_env CC=/usr/lib/llvm-17/bin/clang build --repo_env BAZEL_COMPILER=/usr/lib/llvm-17/bin/clang build --config nvcc_clang build --action_env CLANG_CUDA_COMPILER_PATH=/usr/lib/llvm-17/bin/clang -build --action_env CUDA_TOOLKIT_PATH=/usr/local/cuda-12.2 -build --action_env TF_CUBLAS_VERSION=12.3 +build --action_env TF_CUDA_VERSION=12.3 build --action_env TF_CUDA_COMPUTE_CAPABILITIES=7.5 build --action_env TF_CUDNN_VERSION=8 -build --repo_env TF_NEED_TENSORRT=0 build --config nonccl build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 build --action_env PYTHON_BIN_PATH=/usr/bin/python3 diff --git a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc index 863209697362de..e27b41ffc01a99 100644 --- a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc @@ -1,10 +1,8 @@ build --action_env GCC_HOST_COMPILER_PATH=/usr/bin/gcc build --config cuda -build --action_env CUDA_TOOLKIT_PATH=/usr/local/cuda-12.2 -build --action_env TF_CUBLAS_VERSION=12.3 +build --action_env TF_CUDA_VERSION=12.3 build --action_env TF_CUDA_COMPUTE_CAPABILITIES=7.5 build --action_env TF_CUDNN_VERSION=8 -build --repo_env TF_NEED_TENSORRT=0 build --config nonccl build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 build --action_env PYTHON_BIN_PATH=/usr/bin/python3 diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files index 7655cabdafeb6b..3a7d2c1499c8b2 100644 --- a/third_party/xla/opensource_only.files +++ b/third_party/xla/opensource_only.files @@ -9,6 +9,8 @@ third_party/BUILD: third_party/__init__:.py third_party/compute_library/BUILD: third_party/compute_library/build_defs.bzl: +third_party/cuda_redist_json_repo.bzl: +third_party/cuda_repo.bzl: third_party/implib_so/BUILD: third_party/implib_so/get_symbols.py: third_party/implib_so/make_stub.py: diff --git a/third_party/xla/third_party/cuda_redist_json_repo.bzl b/third_party/xla/third_party/cuda_redist_json_repo.bzl new file mode 100644 index 00000000000000..76941dd74c9488 --- /dev/null +++ b/third_party/xla/third_party/cuda_redist_json_repo.bzl @@ -0,0 +1,110 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining CUDA and cuDNN JSON files with distributives versions.""" + +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_DEFAULT_CUDNN_VERSION = "8.9.7.29" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _cuda_redist_json_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + supported_cuda_versions = repository_ctx.attr.cuda_json_dict.keys() + if cuda_version and (cuda_version not in supported_cuda_versions): + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + else: + fail( + ("The supported CUDA versions are {supported_versions}." + + " Please provide a supported version in TF_CUDA_VERSION" + + " environment variable or add JSON URL for" + + " CUDA version={version}.") + .format( + supported_versions = supported_cuda_versions, + version = cuda_version, + ), + ) + supported_cudnn_versions = repository_ctx.attr.cudnn_json_dict.keys() + if cudnn_version and (cudnn_version not in supported_cudnn_versions): + if cudnn_version in ["8", "8.9"]: + cudnn_version = _DEFAULT_CUDNN_VERSION + else: + fail( + ("The supported CUDNN versions are {supported_versions}." + + " Please provide a supported version in TF_CUDNN_VERSION" + + " environment variable or add JSON URL for" + + " CUDNN version={version}.") + .format( + supported_versions = supported_cudnn_versions, + version = cudnn_version, + ), + ) + cuda_distributives = "{}" + cudnn_distributives = "{}" + if cuda_version: + (url, sha256) = repository_ctx.attr.cuda_json_dict[cuda_version] + json_file_name = "redistrib_cuda_%s.json" % cuda_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cuda_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + if cudnn_version: + (url, sha256) = repository_ctx.attr.cudnn_json_dict[cudnn_version] + json_file_name = "redistrib_cudnn_%s.json" % cudnn_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cudnn_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + + repository_ctx.file( + "build_defs.bzl", + """def get_cuda_distributives(): + return {cuda_distributives} + +def get_cudnn_distributives(): + return {cudnn_distributives} +""".format(cuda_distributives = cuda_distributives, cudnn_distributives = cudnn_distributives), + ) + repository_ctx.file( + "BUILD", + "", + ) + +_cuda_redist_json = repository_rule( + implementation = _cuda_redist_json_impl, + attrs = { + "cuda_json_dict": attr.string_list_dict(mandatory = True), + "cudnn_json_dict": attr.string_list_dict(mandatory = True), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_redist_json(name, cuda_json_dict, cudnn_json_dict): + _cuda_redist_json( + name = name, + cuda_json_dict = cuda_json_dict, + cudnn_json_dict = cudnn_json_dict, + ) diff --git a/third_party/xla/third_party/cuda_repo.bzl 
b/third_party/xla/third_party/cuda_repo.bzl new file mode 100644 index 00000000000000..0dcda26cba3a23 --- /dev/null +++ b/third_party/xla/third_party/cuda_repo.bzl @@ -0,0 +1,327 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining Google ML CUDA dependencies.""" + +load("@cuda_redist_json//:build_defs.bzl", "get_cuda_distributives", "get_cudnn_distributives") +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_OS_ARCH_DICT = { + "amd64": "x86_64-unknown-linux-gnu", + "aarch64": "aarch64-unknown-linux-gnu", +} +_REDIST_ARCH_DICT = { + "linux-x86_64": "x86_64-unknown-linux-gnu", + "linux-sbsa": "aarch64-unknown-linux-gnu", +} + +_CUDA_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cuda/redist/" +_CUDNN_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cudnn/redist/" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _get_archive_name(url, archive_suffix = ".tar.xz"): + last_slash_index = url.rfind("/") + return url[last_slash_index + 1:-len(archive_suffix)] + +def _cuda_http_archive_impl(repository_ctx): + cuda_or_cudnn_version = None + dist_version = "" + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + if repository_ctx.attr.is_cudnn_dist: + cuda_or_cudnn_version = cudnn_version + else: + cuda_or_cudnn_version = cuda_version + if cuda_or_cudnn_version: + # Download archive only when GPU config is used. 
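+        # relative_url_dict is keyed either by bare architecture (e.g.
+        # "x86_64-unknown-linux-gnu") or, when NVIDIA ships one archive per
+        # CUDA major release (as for cuDNN), by "cuda<major>_<arch>"; the
+        # lookup below handles both layouts.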
+ dist_version = repository_ctx.attr.dist_version + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + if arch not in repository_ctx.attr.relative_url_dict.keys(): + (relative_url, sha256) = repository_ctx.attr.relative_url_dict["cuda{version}_{arch}" \ + .format( + version = cuda_version.split(".")[0], + arch = arch, + )] + else: + (relative_url, sha256) = repository_ctx.attr.relative_url_dict[arch] + url = (repository_ctx.attr.cudnn_dist_path_prefix if repository_ctx.attr.is_cudnn_dist else repository_ctx.attr.cuda_dist_path_prefix) + relative_url + + archive_name = _get_archive_name(url) + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.override_strip_prefix if repository_ctx.attr.override_strip_prefix else archive_name, + ) + if repository_ctx.attr.build_template: + version = dist_version.split(".")[0] if dist_version else "" + repository_ctx.file("version.txt", version) + repository_ctx.template( + "BUILD", + repository_ctx.attr.build_template, + {"%{version}": version}, + ) + else: + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_http_archive = repository_rule( + implementation = _cuda_http_archive_impl, + attrs = { + "dist_version": attr.string(mandatory = True), + "relative_url_dict": attr.string_list_dict(mandatory = True), + "build_template": attr.label(), + "build_file": attr.label(), + "is_cudnn_dist": attr.bool(), + "override_strip_prefix": attr.string(), + "cudnn_dist_path_prefix": attr.string(default = _CUDNN_DIST_PATH_PREFIX), + "cuda_dist_path_prefix": attr.string(default = _CUDA_DIST_PATH_PREFIX), + "extension": attr.string(default = ".tar.xz"), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_http_archive(name, dist_version, relative_url_dict, **kwargs): + _cuda_http_archive( + name = name, + dist_version = dist_version, + relative_url_dict = relative_url_dict, + **kwargs + ) + +def _cuda_wheel_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + if cuda_version: + # Download archive only when GPU config is used. + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + dict_key = "{cuda_version}-{arch}".format( + cuda_version = cuda_version, + arch = arch, + ) + supported_versions = repository_ctx.attr.url_dict.keys() + if dict_key not in supported_versions: + fail( + ("The supported NCCL versions are {supported_versions}." 
+ + " Please provide a supported CUDA version in TF_CUDA_VERSION" + + " environment variable or add NCCL distributive for" + + " CUDA version={version}, OS={arch}.") + .format( + supported_versions = supported_versions, + version = cuda_version, + arch = arch, + ), + ) + sha256 = repository_ctx.attr.sha256_dict[dict_key] + url = repository_ctx.attr.url_dict[dict_key] + + archive_name = _get_archive_name(url, archive_suffix = ".whl") + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.strip_prefix, + ) + + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_wheel = repository_rule( + implementation = _cuda_wheel_impl, + attrs = { + "sha256_dict": attr.string_dict(mandatory = True), + "url_dict": attr.string_dict(mandatory = True), + "build_file": attr.label(), + "strip_prefix": attr.string(), + "extension": attr.string(default = ".zip"), + }, + environ = ["TF_CUDA_VERSION"], +) + +def cuda_wheel(name, sha256_dict, url_dict, **kwargs): + _cuda_wheel( + name = name, + sha256_dict = sha256_dict, + url_dict = url_dict, + **kwargs + ) + +def _get_relative_url_dict(dist_info): + relative_url_dict = {} + for arch in _REDIST_ARCH_DICT.keys(): + # CUDNN JSON might contain paths for each CUDA version. + if "relative_path" not in dist_info[arch]: + for cuda_version, data in dist_info[arch].items(): + relative_url_dict["{cuda_version}_{arch}" \ + .format( + cuda_version = cuda_version, + arch = _REDIST_ARCH_DICT[arch], + )] = [data["relative_path"], data["sha256"]] + else: + relative_url_dict[_REDIST_ARCH_DICT[arch]] = [ + dist_info[arch]["relative_path"], + dist_info[arch]["sha256"], + ] + return relative_url_dict + +def _get_cuda_archive( + repo_name, + dist_dict, + dist_name, + build_file = None, + build_template = None, + is_cudnn_dist = False): + if dist_name in dist_dict.keys(): + return cuda_http_archive( + name = repo_name, + dist_version = dist_dict[dist_name]["version"], + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = _get_relative_url_dict(dist_dict[dist_name]), + is_cudnn_dist = is_cudnn_dist, + ) + else: + return cuda_http_archive( + name = repo_name, + dist_version = "", + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = {"": []}, + is_cudnn_dist = is_cudnn_dist, + ) + +def cuda_distributives(cuda_nccl_wheel_dict): + nccl_artifacts_dict = {"sha256_dict": {}, "url_dict": {}} + for cuda_version, nccl_wheel_info in cuda_nccl_wheel_dict.items(): + for arch in _OS_ARCH_DICT.values(): + if arch in nccl_wheel_info.keys(): + cuda_version_to_arch_key = "%s-%s" % (cuda_version, arch) + nccl_artifacts_dict["sha256_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["sha256"] + nccl_artifacts_dict["url_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["url"] + + cuda_wheel( + name = "cuda_nccl", + sha256_dict = nccl_artifacts_dict["sha256_dict"], + url_dict = nccl_artifacts_dict["url_dict"], + build_file = Label("//third_party/gpus/cuda:cuda_nccl.BUILD"), + strip_prefix = "nvidia/nccl", + ) + + cuda_distributives = get_cuda_distributives() + cudnn_distributives = get_cudnn_distributives() + + _get_cuda_archive( + repo_name = "cuda_cccl", + 
dist_dict = cuda_distributives, + dist_name = "cuda_cccl", + build_file = "//third_party/gpus/cuda:cuda_cccl.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_cublas", + dist_dict = cuda_distributives, + dist_name = "libcublas", + build_template = "//third_party/gpus/cuda:cuda_cublas.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudart", + dist_dict = cuda_distributives, + dist_name = "cuda_cudart", + build_template = "//third_party/gpus/cuda:cuda_cudart.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudnn", + dist_dict = cudnn_distributives, + dist_name = "cudnn", + build_template = "//third_party/gpus/cuda:cuda_cudnn.BUILD.tpl", + is_cudnn_dist = True, + ) + _get_cuda_archive( + repo_name = "cuda_cufft", + dist_dict = cuda_distributives, + dist_name = "libcufft", + build_template = "//third_party/gpus/cuda:cuda_cufft.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cupti", + dist_dict = cuda_distributives, + dist_name = "cuda_cupti", + build_template = "//third_party/gpus/cuda:cuda_cupti.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_curand", + dist_dict = cuda_distributives, + dist_name = "libcurand", + build_template = "//third_party/gpus/cuda:cuda_curand.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusolver", + dist_dict = cuda_distributives, + dist_name = "libcusolver", + build_template = "//third_party/gpus/cuda:cuda_cusolver.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusparse", + dist_dict = cuda_distributives, + dist_name = "libcusparse", + build_template = "//third_party/gpus/cuda:cuda_cusparse.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvcc", + dist_dict = cuda_distributives, + dist_name = "cuda_nvcc", + build_file = "//third_party/gpus/cuda:cuda_nvcc.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvjitlink", + dist_dict = cuda_distributives, + dist_name = "libnvjitlink", + build_template = "//third_party/gpus/cuda:cuda_nvjitlink.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvml", + dist_dict = cuda_distributives, + dist_name = "cuda_nvml_dev", + build_file = "//third_party/gpus/cuda:cuda_nvml.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvprune", + dist_dict = cuda_distributives, + dist_name = "cuda_nvprune", + build_file = "//third_party/gpus/cuda:cuda_nvprune.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvtx", + dist_dict = cuda_distributives, + dist_name = "cuda_nvtx", + build_file = "//third_party/gpus/cuda:cuda_nvtx.BUILD", + ) diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc index 02dec0349c4741..c17ae4494dc99c 100644 --- a/third_party/xla/third_party/tsl/.bazelrc +++ b/third_party/xla/third_party/tsl/.bazelrc @@ -226,12 +226,13 @@ build:cuda --repo_env TF_NEED_CUDA=1 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --@local_config_cuda//:enable_cuda +build:no_cuda_libs --@local_config_cuda//cuda:include_hermetic_cuda_libs=false + # CUDA: This config refers to building CUDA op kernels with clang. build:cuda_clang --config=cuda -# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt -build:cuda_clang --config=tensorrt build:cuda_clang --action_env=TF_CUDA_CLANG="1" build:cuda_clang --@local_config_cuda//:cuda_compiler=clang +build:cuda_clang --copt=-Qunused-arguments # Select supported compute capabilities (supported graphics cards). # This is the same as the official TensorFlow builds. 
# See https://developer.nvidia.com/cuda-gpus#compute @@ -244,12 +245,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. build:cuda_clang_official --config=cuda_clang -build:cuda_clang_official --action_env=TF_CUDA_VERSION="12" -build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8" -build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3" +build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3" +build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8.9" build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang" -build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" # Build with nvcc for CUDA and clang for host @@ -545,10 +544,6 @@ build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda" -build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt" -build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl" -test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --config=nvcc_clang @@ -633,7 +628,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla # Test-related settings below this point. test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true test:release_linux_base --local_test_jobs=HOST_CPUS -test:release_linux_base --test_env=LD_LIBRARY_PATH # Give only the list of failed tests at the end of the log test:release_linux_base --test_summary=short @@ -645,7 +639,6 @@ build:release_gpu_linux --config=release_cpu_linux # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. # Note that linux cpu and cuda builds share the same toolchain now. 
build:release_gpu_linux --config=cuda_clang_official -test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" # Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute @@ -674,11 +667,8 @@ test:unsupported_cpu_linux --config=release_base build:unsupported_gpu_linux --config=cuda build:unsupported_gpu_linux --config=unsupported_cpu_linux build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11" -build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8" +build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8.6" build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" -build:unsupported_gpu_linux --config=tensorrt -build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2" -build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib" build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain diff --git a/third_party/xla/third_party/tsl/opensource_only.files b/third_party/xla/third_party/tsl/opensource_only.files index 3f2bcf8431edc0..a789348e6f50c7 100644 --- a/third_party/xla/third_party/tsl/opensource_only.files +++ b/third_party/xla/third_party/tsl/opensource_only.files @@ -6,6 +6,8 @@ third_party/clang_toolchain/cc_configure_clang.bzl: third_party/clang_toolchain/download_clang.bzl: third_party/compute_library/BUILD: third_party/compute_library/build_defs.bzl: +third_party/cuda_redist_json_repo.bzl: +third_party/cuda_repo.bzl: third_party/curl.BUILD: third_party/cython.BUILD: third_party/ducc/BUILD: @@ -21,6 +23,7 @@ third_party/git/BUILD.tpl: third_party/git/BUILD: third_party/git/git_configure.bzl: third_party/gpus/BUILD: +third_party/gpus/compiler_common_tools.bzl: third_party/gpus/crosstool/BUILD.rocm.tpl: third_party/gpus/crosstool/BUILD.sycl.tpl: third_party/gpus/crosstool/BUILD.tpl: @@ -31,15 +34,32 @@ third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl: third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_sycl.tpl: third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl: third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl: +third_party/gpus/cuda/BUILD.hermetic.tpl: third_party/gpus/cuda/BUILD.tpl: third_party/gpus/cuda/BUILD.windows.tpl: third_party/gpus/cuda/BUILD: third_party/gpus/cuda/LICENSE: third_party/gpus/cuda/build_defs.bzl.tpl: +third_party/gpus/cuda/cuda_cccl.BUILD: third_party/gpus/cuda/cuda_config.h.tpl: third_party/gpus/cuda/cuda_config.py.tpl: +third_party/gpus/cuda/cuda_cublas.BUILD.tpl: +third_party/gpus/cuda/cuda_cudart.BUILD.tpl: +third_party/gpus/cuda/cuda_cudnn.BUILD.tpl: +third_party/gpus/cuda/cuda_cufft.BUILD.tpl: +third_party/gpus/cuda/cuda_cupti.BUILD.tpl: +third_party/gpus/cuda/cuda_curand.BUILD.tpl: +third_party/gpus/cuda/cuda_cusolver.BUILD.tpl: +third_party/gpus/cuda/cuda_cusparse.BUILD.tpl: +third_party/gpus/cuda/cuda_nccl.BUILD: +third_party/gpus/cuda/cuda_nvcc.BUILD: +third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl: +third_party/gpus/cuda/cuda_nvml.BUILD: +third_party/gpus/cuda/cuda_nvprune.BUILD: 
+third_party/gpus/cuda/cuda_nvtx.BUILD: third_party/gpus/cuda_configure.bzl: third_party/gpus/find_cuda_config:.py +third_party/gpus/hermetic_cuda_configure.bzl: third_party/gpus/rocm/BUILD.tpl: third_party/gpus/rocm/BUILD: third_party/gpus/rocm/build_defs.bzl.tpl: @@ -68,6 +88,7 @@ third_party/nccl/archive.BUILD: third_party/nccl/archive.patch: third_party/nccl/build_defs.bzl.tpl: third_party/nccl/generated_names.bzl.tpl: +third_party/nccl/hermetic_nccl_configure.bzl: third_party/nccl/nccl_configure.bzl: third_party/nccl/system.BUILD.tpl: third_party/nvtx/BUILD: diff --git a/third_party/xla/third_party/tsl/third_party/cuda_redist_json_repo.bzl b/third_party/xla/third_party/tsl/third_party/cuda_redist_json_repo.bzl new file mode 100644 index 00000000000000..76941dd74c9488 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/cuda_redist_json_repo.bzl @@ -0,0 +1,110 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining CUDA and cuDNN JSON files with distributives versions.""" + +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_DEFAULT_CUDNN_VERSION = "8.9.7.29" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _cuda_redist_json_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + supported_cuda_versions = repository_ctx.attr.cuda_json_dict.keys() + if cuda_version and (cuda_version not in supported_cuda_versions): + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + else: + fail( + ("The supported CUDA versions are {supported_versions}." + + " Please provide a supported version in TF_CUDA_VERSION" + + " environment variable or add JSON URL for" + + " CUDA version={version}.") + .format( + supported_versions = supported_cuda_versions, + version = cuda_version, + ), + ) + supported_cudnn_versions = repository_ctx.attr.cudnn_json_dict.keys() + if cudnn_version and (cudnn_version not in supported_cudnn_versions): + if cudnn_version in ["8", "8.9"]: + cudnn_version = _DEFAULT_CUDNN_VERSION + else: + fail( + ("The supported CUDNN versions are {supported_versions}." 
+ + " Please provide a supported version in TF_CUDNN_VERSION" + + " environment variable or add JSON URL for" + + " CUDNN version={version}.") + .format( + supported_versions = supported_cudnn_versions, + version = cudnn_version, + ), + ) + cuda_distributives = "{}" + cudnn_distributives = "{}" + if cuda_version: + (url, sha256) = repository_ctx.attr.cuda_json_dict[cuda_version] + json_file_name = "redistrib_cuda_%s.json" % cuda_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cuda_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + if cudnn_version: + (url, sha256) = repository_ctx.attr.cudnn_json_dict[cudnn_version] + json_file_name = "redistrib_cudnn_%s.json" % cudnn_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cudnn_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + + repository_ctx.file( + "build_defs.bzl", + """def get_cuda_distributives(): + return {cuda_distributives} + +def get_cudnn_distributives(): + return {cudnn_distributives} +""".format(cuda_distributives = cuda_distributives, cudnn_distributives = cudnn_distributives), + ) + repository_ctx.file( + "BUILD", + "", + ) + +_cuda_redist_json = repository_rule( + implementation = _cuda_redist_json_impl, + attrs = { + "cuda_json_dict": attr.string_list_dict(mandatory = True), + "cudnn_json_dict": attr.string_list_dict(mandatory = True), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_redist_json(name, cuda_json_dict, cudnn_json_dict): + _cuda_redist_json( + name = name, + cuda_json_dict = cuda_json_dict, + cudnn_json_dict = cudnn_json_dict, + ) diff --git a/third_party/xla/third_party/tsl/third_party/cuda_repo.bzl b/third_party/xla/third_party/tsl/third_party/cuda_repo.bzl new file mode 100644 index 00000000000000..0dcda26cba3a23 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/cuda_repo.bzl @@ -0,0 +1,327 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for defining Google ML CUDA dependencies.""" + +load("@cuda_redist_json//:build_defs.bzl", "get_cuda_distributives", "get_cudnn_distributives") +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_OS_ARCH_DICT = { + "amd64": "x86_64-unknown-linux-gnu", + "aarch64": "aarch64-unknown-linux-gnu", +} +_REDIST_ARCH_DICT = { + "linux-x86_64": "x86_64-unknown-linux-gnu", + "linux-sbsa": "aarch64-unknown-linux-gnu", +} + +_CUDA_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cuda/redist/" +_CUDNN_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cudnn/redist/" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _get_archive_name(url, archive_suffix = ".tar.xz"): + last_slash_index = url.rfind("/") + return url[last_slash_index + 1:-len(archive_suffix)] + +def _cuda_http_archive_impl(repository_ctx): + cuda_or_cudnn_version = None + dist_version = "" + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + if repository_ctx.attr.is_cudnn_dist: + cuda_or_cudnn_version = cudnn_version + else: + cuda_or_cudnn_version = cuda_version + if cuda_or_cudnn_version: + # Download archive only when GPU config is used. + dist_version = repository_ctx.attr.dist_version + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + if arch not in repository_ctx.attr.relative_url_dict.keys(): + (relative_url, sha256) = repository_ctx.attr.relative_url_dict["cuda{version}_{arch}" \ + .format( + version = cuda_version.split(".")[0], + arch = arch, + )] + else: + (relative_url, sha256) = repository_ctx.attr.relative_url_dict[arch] + url = (repository_ctx.attr.cudnn_dist_path_prefix if repository_ctx.attr.is_cudnn_dist else repository_ctx.attr.cuda_dist_path_prefix) + relative_url + + archive_name = _get_archive_name(url) + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.override_strip_prefix if repository_ctx.attr.override_strip_prefix else archive_name, + ) + if repository_ctx.attr.build_template: + version = dist_version.split(".")[0] if dist_version else "" + repository_ctx.file("version.txt", version) + repository_ctx.template( + "BUILD", + repository_ctx.attr.build_template, + {"%{version}": version}, + ) + else: + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_http_archive = repository_rule( + implementation = _cuda_http_archive_impl, + attrs = { + "dist_version": attr.string(mandatory = True), + "relative_url_dict": attr.string_list_dict(mandatory = True), + "build_template": attr.label(), + "build_file": attr.label(), + "is_cudnn_dist": attr.bool(), + "override_strip_prefix": attr.string(), + "cudnn_dist_path_prefix": attr.string(default = _CUDNN_DIST_PATH_PREFIX), + "cuda_dist_path_prefix": attr.string(default = _CUDA_DIST_PATH_PREFIX), + "extension": attr.string(default = ".tar.xz"), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_http_archive(name, dist_version, relative_url_dict, **kwargs): + _cuda_http_archive( + name = name, + dist_version = dist_version, + relative_url_dict = relative_url_dict, + **kwargs + ) + +def _cuda_wheel_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, 
"TF_CUDA_VERSION") + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + if cuda_version: + # Download archive only when GPU config is used. + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + dict_key = "{cuda_version}-{arch}".format( + cuda_version = cuda_version, + arch = arch, + ) + supported_versions = repository_ctx.attr.url_dict.keys() + if dict_key not in supported_versions: + fail( + ("The supported NCCL versions are {supported_versions}." + + " Please provide a supported CUDA version in TF_CUDA_VERSION" + + " environment variable or add NCCL distributive for" + + " CUDA version={version}, OS={arch}.") + .format( + supported_versions = supported_versions, + version = cuda_version, + arch = arch, + ), + ) + sha256 = repository_ctx.attr.sha256_dict[dict_key] + url = repository_ctx.attr.url_dict[dict_key] + + archive_name = _get_archive_name(url, archive_suffix = ".whl") + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.strip_prefix, + ) + + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_wheel = repository_rule( + implementation = _cuda_wheel_impl, + attrs = { + "sha256_dict": attr.string_dict(mandatory = True), + "url_dict": attr.string_dict(mandatory = True), + "build_file": attr.label(), + "strip_prefix": attr.string(), + "extension": attr.string(default = ".zip"), + }, + environ = ["TF_CUDA_VERSION"], +) + +def cuda_wheel(name, sha256_dict, url_dict, **kwargs): + _cuda_wheel( + name = name, + sha256_dict = sha256_dict, + url_dict = url_dict, + **kwargs + ) + +def _get_relative_url_dict(dist_info): + relative_url_dict = {} + for arch in _REDIST_ARCH_DICT.keys(): + # CUDNN JSON might contain paths for each CUDA version. 
+ if "relative_path" not in dist_info[arch]: + for cuda_version, data in dist_info[arch].items(): + relative_url_dict["{cuda_version}_{arch}" \ + .format( + cuda_version = cuda_version, + arch = _REDIST_ARCH_DICT[arch], + )] = [data["relative_path"], data["sha256"]] + else: + relative_url_dict[_REDIST_ARCH_DICT[arch]] = [ + dist_info[arch]["relative_path"], + dist_info[arch]["sha256"], + ] + return relative_url_dict + +def _get_cuda_archive( + repo_name, + dist_dict, + dist_name, + build_file = None, + build_template = None, + is_cudnn_dist = False): + if dist_name in dist_dict.keys(): + return cuda_http_archive( + name = repo_name, + dist_version = dist_dict[dist_name]["version"], + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = _get_relative_url_dict(dist_dict[dist_name]), + is_cudnn_dist = is_cudnn_dist, + ) + else: + return cuda_http_archive( + name = repo_name, + dist_version = "", + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = {"": []}, + is_cudnn_dist = is_cudnn_dist, + ) + +def cuda_distributives(cuda_nccl_wheel_dict): + nccl_artifacts_dict = {"sha256_dict": {}, "url_dict": {}} + for cuda_version, nccl_wheel_info in cuda_nccl_wheel_dict.items(): + for arch in _OS_ARCH_DICT.values(): + if arch in nccl_wheel_info.keys(): + cuda_version_to_arch_key = "%s-%s" % (cuda_version, arch) + nccl_artifacts_dict["sha256_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["sha256"] + nccl_artifacts_dict["url_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["url"] + + cuda_wheel( + name = "cuda_nccl", + sha256_dict = nccl_artifacts_dict["sha256_dict"], + url_dict = nccl_artifacts_dict["url_dict"], + build_file = Label("//third_party/gpus/cuda:cuda_nccl.BUILD"), + strip_prefix = "nvidia/nccl", + ) + + cuda_distributives = get_cuda_distributives() + cudnn_distributives = get_cudnn_distributives() + + _get_cuda_archive( + repo_name = "cuda_cccl", + dist_dict = cuda_distributives, + dist_name = "cuda_cccl", + build_file = "//third_party/gpus/cuda:cuda_cccl.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_cublas", + dist_dict = cuda_distributives, + dist_name = "libcublas", + build_template = "//third_party/gpus/cuda:cuda_cublas.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudart", + dist_dict = cuda_distributives, + dist_name = "cuda_cudart", + build_template = "//third_party/gpus/cuda:cuda_cudart.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudnn", + dist_dict = cudnn_distributives, + dist_name = "cudnn", + build_template = "//third_party/gpus/cuda:cuda_cudnn.BUILD.tpl", + is_cudnn_dist = True, + ) + _get_cuda_archive( + repo_name = "cuda_cufft", + dist_dict = cuda_distributives, + dist_name = "libcufft", + build_template = "//third_party/gpus/cuda:cuda_cufft.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cupti", + dist_dict = cuda_distributives, + dist_name = "cuda_cupti", + build_template = "//third_party/gpus/cuda:cuda_cupti.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_curand", + dist_dict = cuda_distributives, + dist_name = "libcurand", + build_template = "//third_party/gpus/cuda:cuda_curand.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusolver", + dist_dict = cuda_distributives, + dist_name = "libcusolver", + build_template = "//third_party/gpus/cuda:cuda_cusolver.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = 
"cuda_cusparse", + dist_dict = cuda_distributives, + dist_name = "libcusparse", + build_template = "//third_party/gpus/cuda:cuda_cusparse.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvcc", + dist_dict = cuda_distributives, + dist_name = "cuda_nvcc", + build_file = "//third_party/gpus/cuda:cuda_nvcc.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvjitlink", + dist_dict = cuda_distributives, + dist_name = "libnvjitlink", + build_template = "//third_party/gpus/cuda:cuda_nvjitlink.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvml", + dist_dict = cuda_distributives, + dist_name = "cuda_nvml_dev", + build_file = "//third_party/gpus/cuda:cuda_nvml.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvprune", + dist_dict = cuda_distributives, + dist_name = "cuda_nvprune", + build_file = "//third_party/gpus/cuda:cuda_nvprune.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvtx", + dist_dict = cuda_distributives, + dist_name = "cuda_nvtx", + build_file = "//third_party/gpus/cuda:cuda_nvtx.BUILD", + ) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/compiler_common_tools.bzl b/third_party/xla/third_party/tsl/third_party/gpus/compiler_common_tools.bzl new file mode 100644 index 00000000000000..bd07f49ec457bb --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/compiler_common_tools.bzl @@ -0,0 +1,174 @@ +"""Common compiler functions. """ + +load( + "//third_party/remote_config:common.bzl", + "err_out", + "raw_exec", + "realpath", +) + +def to_list_of_strings(elements): + """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. + + This is to be used to put a list of strings into the bzl file templates + so it gets interpreted as list of strings in Starlark. + + Args: + elements: list of string elements + + Returns: + single string of elements wrapped in quotes separated by a comma.""" + quoted_strings = ["\"" + element + "\"" for element in elements] + return ", ".join(quoted_strings) + +_INC_DIR_MARKER_BEGIN = "#include <...>" + +# OSX add " (framework directory)" at the end of line, strip it. +_OSX_FRAMEWORK_SUFFIX = " (framework directory)" +_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) + +# TODO(dzc): Once these functions have been factored out of Bazel's +# cc_configure.bzl, load them from @bazel_tools instead. +def _cxx_inc_convert(path): + """Convert path returned by cc -E xc++ in a complete path.""" + path = path.strip() + if path.endswith(_OSX_FRAMEWORK_SUFFIX): + path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() + return path + +def _normalize_include_path(repository_ctx, path): + """Normalizes include paths before writing them to the crosstool. + + If path points inside the 'crosstool' folder of the repository, a relative + path is returned. + If path points outside the 'crosstool' folder, an absolute path is returned. + """ + path = str(repository_ctx.path(path)) + crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) + + if path.startswith(crosstool_folder): + # We drop the path to "$REPO/crosstool" and a trailing path separator. + return path[len(crosstool_folder) + 1:] + return path + +def _is_compiler_option_supported(repository_ctx, cc, option): + """Checks that `option` is supported by the C compiler. 
Doesn't %-escape the option.""" + result = repository_ctx.execute([ + cc, + option, + "-o", + "/dev/null", + "-c", + str(repository_ctx.path("tools/cpp/empty.cc")), + ]) + return result.stderr.find(option) == -1 + +def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sys_root): + """Compute the list of default C or C++ include directories.""" + if lang_is_cpp: + lang = "c++" + else: + lang = "c" + sysroot = [] + if tf_sys_root: + sysroot += ["--sysroot", tf_sys_root] + result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + + sysroot) + stderr = err_out(result) + index1 = stderr.find(_INC_DIR_MARKER_BEGIN) + if index1 == -1: + return [] + index1 = stderr.find("\n", index1) + if index1 == -1: + return [] + index2 = stderr.rfind("\n ") + if index2 == -1 or index2 < index1: + return [] + index2 = stderr.find("\n", index2 + 1) + if index2 == -1: + inc_dirs = stderr[index1 + 1:] + else: + inc_dirs = stderr[index1 + 1:index2].strip() + + print_resource_dir_supported = _is_compiler_option_supported( + repository_ctx, + cc, + "-print-resource-dir", + ) + + if print_resource_dir_supported: + resource_dir = repository_ctx.execute( + [cc, "-print-resource-dir"], + ).stdout.strip() + "/share" + inc_dirs += "\n" + resource_dir + + compiler_includes = [ + _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) + for p in inc_dirs.split("\n") + ] + + # The compiler might be on a symlink, e.g. /symlink -> /opt/gcc + # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) + # but Bazel might encounter either (usually reported by the compiler) + # especially when a compiler wrapper (e.g. ccache) is used. + # So we need to also include paths where symlinks are not resolved. + + # Try to find real path to CC installation to "see through" compiler wrappers + # GCC has the path to g++ + index1 = result.stderr.find("COLLECT_GCC=") + if index1 != -1: + index1 = result.stderr.find("=", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname + else: + # Clang has the directory + index1 = result.stderr.find("InstalledDir: ") + if index1 != -1: + index1 = result.stderr.find(" ", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname + else: + # Fallback to the CC path + cc_topdir = repository_ctx.path(cc).dirname.dirname + + # We now have the compiler installation prefix, e.g. /symlink/gcc + # And the resolved installation prefix, e.g. /opt/gcc + cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() + cc_topdir = str(cc_topdir).strip() + + # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. + # e.g. [/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] + # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] + if cc_topdir_resolved != cc_topdir: + unresolved_compiler_includes = [ + cc_topdir + inc[len(cc_topdir_resolved):] + for inc in compiler_includes + if inc.startswith(cc_topdir_resolved) + ] + compiler_includes = compiler_includes + unresolved_compiler_includes + return compiler_includes + +def get_cxx_inc_directories(repository_ctx, cc, tf_sys_root): + """Compute the list of default C and C++ include directories.""" + + # For some reason `clang -xc` sometimes returns include paths that are + # different from the ones from `clang -xc++`. 
(Symlink and a dir) + # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists + includes_cpp = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + True, + tf_sys_root, + ) + includes_c = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + False, + tf_sys_root, + ) + + return includes_cpp + [ + inc + for inc in includes_c + if inc not in includes_cpp + ] diff --git a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl index 8eda7a1cf6ac2b..b9553d9b99ecfe 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl +++ b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl @@ -2,6 +2,7 @@ # Update cuda_configure.bzl#verify_build_defines when adding new variables. load(":cc_toolchain_config.bzl", "cc_toolchain_config") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") licenses(["restricted"]) @@ -133,9 +134,17 @@ filegroup( srcs = [], ) +filegroup( + name = "cuda_nvcc_files", + srcs = %{cuda_nvcc_files}, +) + filegroup( name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], + srcs = [ + ":cuda_nvcc_files", + ":clang/bin/crosstool_wrapper_driver_is_not_gcc" + ], ) filegroup( diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.hermetic.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.hermetic.tpl new file mode 100644 index 00000000000000..1c00f1c5e32916 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.hermetic.tpl @@ -0,0 +1,291 @@ +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +# Config setting whether TensorFlow is built with CUDA support using clang. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_clang. +selects.config_setting_group( + name = "using_clang", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_clang", + ], +) + +# Config setting whether TensorFlow is built with CUDA support using nvcc. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. +selects.config_setting_group( + name = "using_nvcc", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_nvcc", + ], +) + +# Equivalent to using_clang && -c opt. +selects.config_setting_group( + name = "using_clang_opt", + match_all = [ + ":using_clang", + ":_opt", + ], +) + +config_setting( + name = "_opt", + values = {"compilation_mode": "opt"}, +) + +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. 
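+# In this hermetic template each component's headers come from its own
+# downloaded repository via the aliases further down, so :cuda_headers is
+# a thin aggregation target rather than a copy of a local toolkit.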
+cc_library( + name = "cuda_headers", + hdrs = [ + "cuda/cuda_config.h", + ], + include_prefix = "third_party/gpus", + includes = [ + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h + ], + deps = [":cudart_headers", + ":cublas_headers", + ":cccl_headers", + ":nvtx_headers", + ":nvcc_headers", + ":nvjitlink_headers", + ":cusolver_headers", + ":cufft_headers", + ":cusparse_headers", + ":curand_headers", + ":cupti_headers", + ":nvml_headers"], +) + +cc_library( + name = "cudart_static", + srcs = ["@cuda_cudart//:static"], + linkopts = [ + "-ldl", + "-lpthread", + %{cudart_static_linkopt} + ], +) + +alias( + name = "cuda_driver", + actual = "@cuda_cudart//:cuda_driver", +) + +alias( + name = "cudart_headers", + actual = "@cuda_cudart//:headers", +) + +alias( + name = "cudart", + actual = "@cuda_cudart//:cudart", +) + +alias( + name = "nvjitlink_headers", + actual = "@cuda_nvjitlink//:headers", +) + +alias( + name = "nvjitlink", + actual = "@cuda_nvjitlink//:nvjitlink", +) + +alias( + name = "nvtx_headers", + actual = "@cuda_nvtx//:headers", +) + +alias( + name = "nvml_headers", + actual = "@cuda_nvml//:headers", +) + +alias( + name = "nvcc_headers", + actual = "@cuda_nvcc//:headers", +) + +alias( + name = "cccl_headers", + actual = "@cuda_cccl//:headers", +) + +alias( + name = "cublas_headers", + actual = "@cuda_cublas//:headers", +) + +alias( + name = "cusolver_headers", + actual = "@cuda_cusolver//:headers", +) + +alias( + name = "cufft_headers", + actual = "@cuda_cufft//:headers", +) + +alias( + name = "cusparse_headers", + actual = "@cuda_cusparse//:headers", +) + +alias( + name = "curand_headers", + actual = "@cuda_curand//:headers", +) + +alias( + name = "cublas", + actual = "@cuda_cublas//:cublas", +) + +alias( + name = "cublasLt", + actual = "@cuda_cublas//:cublasLt", +) + +alias( + name = "cusolver", + actual = "@cuda_cusolver//:cusolver", +) + +alias( + name = "cudnn", + actual = "@cuda_cudnn//:cudnn", +) + +alias( + name = "cudnn_ops_infer", + actual = "@cuda_cudnn//:cudnn_ops_infer", +) + +alias( + name = "cudnn_cnn_infer", + actual = "@cuda_cudnn//:cudnn_cnn_infer", +) + +alias( + name = "cudnn_ops_train", + actual = "@cuda_cudnn//:cudnn_ops_train", +) + +alias( + name = "cudnn_cnn_train", + actual = "@cuda_cudnn//:cudnn_cnn_train", +) + +alias( + name = "cudnn_adv_infer", + actual = "@cuda_cudnn//:cudnn_adv_infer", +) + +alias( + name = "cudnn_adv_train", + actual = "@cuda_cudnn//:cudnn_adv_train", +) +alias( + name = "cudnn_header", + actual = "@cuda_cudnn//:headers", +) + +alias( + name = "cufft", + actual = "@cuda_cufft//:cufft", +) + +alias( + name = "curand", + actual = "@cuda_curand//:curand", +) + +cc_library( + name = "cuda", + deps = [ + ":cublas", + ":cublasLt", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +alias( + name = "cub_headers", + actual = "%{cub_actual}", +) + +alias( + name = "cupti_headers", + actual = "@cuda_cupti//:headers", +) + +alias( + name = "cupti_dsos", + actual = "@cuda_cupti//:cupti", +) + +alias( + name = "cusparse", + actual = "@cuda_cusparse//:cusparse", +) + +cc_library( + name = "libdevice_root", + data = ["@cuda_nvcc//:nvvm"], +) + +bzl_library( + name = "build_defs_bzl", + srcs = ["build_defs.bzl"], + deps = [ + "@bazel_skylib//lib:selects", + ], +) + +py_library( + name = "cuda_config_py", + srcs = ["cuda/cuda_config.py"], +) + +# Config setting whether TensorFlow is built with hermetic CUDA. 
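+# ":hermetic_cuda_tools" tracks @local_config_cuda//:is_cuda_enabled, while
+# the bool_flag below (default True here, default False in BUILD.tpl) lets a
+# build drop the downloaded NVIDIA shared libraries again via
+# --@local_config_cuda//cuda:include_hermetic_cuda_libs=false, i.e. the
+# no_cuda_libs config defined in the .bazelrc changes above.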
+alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = True, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl index 90a18b90de048c..a4264cc14890e5 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl @@ -1,6 +1,7 @@ load(":build_defs.bzl", "cuda_header_library") load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like @@ -144,7 +145,6 @@ cc_library( name = "cusolver", srcs = ["cuda/lib/%{cusolver_lib}"], data = ["cuda/lib/%{cusolver_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -220,7 +220,6 @@ cc_library( name = "cusparse", srcs = ["cuda/lib/%{cusparse_lib}"], data = ["cuda/lib/%{cusparse_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -242,4 +241,29 @@ py_library( srcs = ["cuda/cuda_config.py"], ) +# Config setting whether TensorFlow is built with hermetic CUDA. +alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = False, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + %{copy_rules} diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cccl.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cccl.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cccl.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cublas.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cublas.BUILD.tpl new file mode 100644 index 00000000000000..d5766c971a50ff --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cublas.BUILD.tpl @@ -0,0 +1,33 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cublas", + hdrs = [":headers"], + shared_library = "lib/libcublas.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cublasLt", + hdrs = [":headers"], + shared_library = "lib/libcublasLt.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cublas.h", + 
"include/cublas_v2.h", + "include/cublas_api.h", + "include/cublasLt.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudart.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudart.BUILD.tpl new file mode 100644 index 00000000000000..08655e7819156c --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudart.BUILD.tpl @@ -0,0 +1,34 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +filegroup( + name = "static", + srcs = ["lib/libcudart_static.a"], + visibility = ["@local_config_cuda//cuda:__pkg__"], +) + +cc_import( + name = "cuda_driver", + shared_library = "lib/stubs/libcuda.so", +) + +cc_import( + name = "cudart", + hdrs = [":headers"], + shared_library = "lib/libcudart.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl new file mode 100644 index 00000000000000..98da6e69cbe644 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl @@ -0,0 +1,65 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cudnn", + hdrs = [":headers"], + shared_library = "lib/libcudnn.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cudnn", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cufft.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cufft.BUILD.tpl new file mode 100644 index 00000000000000..6836814dc9b622 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cufft.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = 
"cufft", + hdrs = [":headers"], + shared_library = "lib/libcufft.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cupti.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cupti.BUILD.tpl new file mode 100644 index 00000000000000..772386d723649f --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cupti.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cupti", + hdrs = [":headers"], + shared_library = "lib/libcupti.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/extras/CUPTI/include", + includes = ["include/"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_curand.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_curand.BUILD.tpl new file mode 100644 index 00000000000000..c98ded26f4b907 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_curand.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "curand", + hdrs = [":headers"], + shared_library = "lib/libcurand.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl new file mode 100644 index 00000000000000..6a5f9d9737cfe2 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl @@ -0,0 +1,25 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusolver", + hdrs = [":headers"], + shared_library = "lib/libcusolver.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cusolver_common.h", + "include/cusolverDn.h", + "include/cusolverSp.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl new file mode 100644 index 00000000000000..ad5c2b5f0c45c1 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusparse", + hdrs = [":headers"], + shared_library = "lib/libcusparse.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs 
= glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nccl.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nccl.BUILD new file mode 100644 index 00000000000000..440b31c5cb616e --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nccl.BUILD @@ -0,0 +1,7 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_import( + name = "nccl", + shared_library = "lib/libnccl.so.2", + visibility = ["//visibility:public"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvcc.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvcc.BUILD new file mode 100644 index 00000000000000..6cdaca5cc902a0 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvcc.BUILD @@ -0,0 +1,73 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "bin/nvcc", +]) + +filegroup( + name = "nvvm", + srcs = [ + "nvvm/libdevice/libdevice.10.bc", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "nvlink", + srcs = [ + "bin/nvlink", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "fatbinary", + srcs = [ + "bin/fatbinary", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin2c", + srcs = [ + "bin/bin2c", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "ptxas", + srcs = [ + "bin/ptxas", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin", + srcs = glob([ + "bin/**", + "nvvm/bin/**", + ]), + visibility = ["//visibility:public"], +) + +filegroup( + name = "link_stub", + srcs = [ + "bin/crt/link.stub", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl new file mode 100644 index 00000000000000..6729b7cd1df9c4 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "nvjitlink", + hdrs = [":headers"], + shared_library = "lib/libnvJitLink.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvml.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvml.BUILD new file mode 100644 index 00000000000000..40b97e671cf7de --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvml.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/nvml/include", + includes = ["include"], + strip_include_prefix = "include", + 
visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvprune.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvprune.BUILD new file mode 100644 index 00000000000000..986ef0c8f76166 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvprune.BUILD @@ -0,0 +1,9 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +filegroup( + name = "nvprune", + srcs = [ + "bin/nvprune", + ], + visibility = ["//visibility:public"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvtx.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvtx.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvtx.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl index fefbf081c87e1c..b8aad7ed4994ee 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl @@ -53,6 +53,11 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "get_cxx_inc_directories", + "to_list_of_strings", +) _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX" @@ -67,20 +72,6 @@ _TF_CUDA_CONFIG_REPO = "TF_CUDA_CONFIG_REPO" _TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" -def to_list_of_strings(elements): - """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. - - This is to be used to put a list of strings into the bzl file templates - so it gets interpreted as list of strings in Starlark. - - Args: - elements: list of string elements - - Returns: - single string of elements wrapped in quotes separated by a comma.""" - quoted_strings = ["\"" + element + "\"" for element in elements] - return ", ".join(quoted_strings) - def verify_build_defines(params): """Verify all variables that crosstool/BUILD.tpl expects are substituted. @@ -238,156 +229,6 @@ def find_cc(repository_ctx, use_cuda_clang): " environment variable").format(target_cc_name, cc_path_envvar)) return cc -_INC_DIR_MARKER_BEGIN = "#include <...>" - -# OSX add " (framework directory)" at the end of line, strip it. -_OSX_FRAMEWORK_SUFFIX = " (framework directory)" -_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) - -def _cxx_inc_convert(path): - """Convert path returned by cc -E xc++ in a complete path.""" - path = path.strip() - if path.endswith(_OSX_FRAMEWORK_SUFFIX): - path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() - return path - -def _normalize_include_path(repository_ctx, path): - """Normalizes include paths before writing them to the crosstool. - - If path points inside the 'crosstool' folder of the repository, a relative - path is returned. - If path points outside the 'crosstool' folder, an absolute path is returned. 
- """ - path = str(repository_ctx.path(path)) - crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) - - if path.startswith(crosstool_folder): - # We drop the path to "$REPO/crosstool" and a trailing path separator. - return path[len(crosstool_folder) + 1:] - return path - -def _is_compiler_option_supported(repository_ctx, cc, option): - """Checks that `option` is supported by the C compiler. Doesn't %-escape the option.""" - result = repository_ctx.execute([ - cc, - option, - "-o", - "/dev/null", - "-c", - str(repository_ctx.path("tools/cpp/empty.cc")), - ]) - return result.stderr.find(option) == -1 - -def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sysroot): - """Compute the list of default C or C++ include directories.""" - if lang_is_cpp: - lang = "c++" - else: - lang = "c" - sysroot = [] - if tf_sysroot: - sysroot += ["--sysroot", tf_sysroot] - result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + - sysroot) - stderr = err_out(result) - index1 = stderr.find(_INC_DIR_MARKER_BEGIN) - if index1 == -1: - return [] - index1 = stderr.find("\n", index1) - if index1 == -1: - return [] - index2 = stderr.rfind("\n ") - if index2 == -1 or index2 < index1: - return [] - index2 = stderr.find("\n", index2 + 1) - if index2 == -1: - inc_dirs = stderr[index1 + 1:] - else: - inc_dirs = stderr[index1 + 1:index2].strip() - - print_resource_dir_supported = _is_compiler_option_supported( - repository_ctx, - cc, - "-print-resource-dir", - ) - - if print_resource_dir_supported: - resource_dir = repository_ctx.execute( - [cc, "-print-resource-dir"], - ).stdout.strip() + "/share" - inc_dirs += "\n" + resource_dir - - compiler_includes = [ - _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) - for p in inc_dirs.split("\n") - ] - - # The compiler might be on a symlink, e.g. /symlink -> /opt/gcc - # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) - # but Bazel might encounter either (usually reported by the compiler) - # especially when a compiler wrapper (e.g. ccache) is used. - # So we need to also include paths where symlinks are not resolved. - - # Try to find real path to CC installation to "see through" compiler wrappers - # GCC has the path to g++ - index1 = result.stderr.find("COLLECT_GCC=") - if index1 != -1: - index1 = result.stderr.find("=", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname - else: - # Clang has the directory - index1 = result.stderr.find("InstalledDir: ") - if index1 != -1: - index1 = result.stderr.find(" ", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname - else: - # Fallback to the CC path - cc_topdir = repository_ctx.path(cc).dirname.dirname - - # We now have the compiler installation prefix, e.g. /symlink/gcc - # And the resolved installation prefix, e.g. /opt/gcc - cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() - cc_topdir = str(cc_topdir).strip() - - # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. - # e.g. 
[/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] - # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] - if cc_topdir_resolved != cc_topdir: - unresolved_compiler_includes = [ - cc_topdir + inc[len(cc_topdir_resolved):] - for inc in compiler_includes - if inc.startswith(cc_topdir_resolved) - ] - compiler_includes = compiler_includes + unresolved_compiler_includes - return compiler_includes - -def get_cxx_inc_directories(repository_ctx, cc, tf_sysroot): - """Compute the list of default C and C++ include directories.""" - - # For some reason `clang -xc` sometimes returns include paths that are - # different from the ones from `clang -xc++`. (Symlink and a dir) - # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists - includes_cpp = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - True, - tf_sysroot, - ) - includes_c = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - False, - tf_sysroot, - ) - - return includes_cpp + [ - inc - for inc in includes_c - if inc not in includes_cpp - ] - def auto_configure_fail(msg): """Output failure message when cuda configuration fails.""" red = "\033[0;31m" @@ -1293,6 +1134,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cuda_nvcc_files}"] = "[]" if is_cuda_clang and not is_nvcc_and_clang: cuda_defines["%{host_compiler_path}"] = str(cc) cuda_defines["%{host_compiler_warnings}"] = """ diff --git a/third_party/xla/third_party/tsl/third_party/gpus/hermetic_cuda_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/hermetic_cuda_configure.bzl new file mode 100644 index 00000000000000..5d16aa6f76a1f4 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/hermetic_cuda_configure.bzl @@ -0,0 +1,570 @@ +"""Repository rule for hermetic CUDA autoconfiguration. + +`hermetic_cuda_configure` depends on the following environment variables: + + * `TF_NEED_CUDA`: Whether to enable building with CUDA. + * `TF_NVCC_CLANG`: Whether to use clang for C++ and NVCC for Cuda compilation. + * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for + both host and device code compilation. + * `TF_SYSROOT`: The sysroot to use when compiling. + * `TF_CUDA_VERSION`: The version of the CUDA toolkit (mandatory). + * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is + `3.5,5.2`. + * `PYTHON_BIN_PATH`: The python binary path +""" + +load( + "//third_party/remote_config:common.bzl", + "get_cpu_value", + "get_host_environ", + "which", +) +load( + ":compiler_common_tools.bzl", + "get_cxx_inc_directories", + "to_list_of_strings", +) + +def _find_cc(repository_ctx): + """Find the C++ compiler.""" + cc_path_envvar = _CLANG_CUDA_COMPILER_PATH + cc_name = "clang" + + cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar) + if cc_name_from_env: + cc_name = cc_name_from_env + if cc_name.startswith("/"): + # Return the absolute path. 
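+ # (i.e. CLANG_CUDA_COMPILER_PATH pointed at an absolute compiler path, + # so no PATH lookup is needed).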
+ return cc_name + cc = which(repository_ctx, cc_name) + if cc == None: + fail(("Cannot find {}, either correct your path or set the {}" + + " environment variable").format(cc_name, cc_path_envvar)) + return cc + +def _auto_configure_fail(msg): + """Output failure message when cuda configuration fails.""" + red = "\033[0;31m" + no_color = "\033[0m" + fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) + +def _lib_name(base_name, cpu_value, version = None, static = False): + """Constructs the platform-specific name of a library. + + Args: + base_name: The name of the library, such as "cudart". + cpu_value: The name of the host operating system. + version: The version of the library. + static: True if the library is static, False if it is a shared object. + + Returns: + The platform-specific name of the library. + """ + version = "" if not version else "." + version + if cpu_value == "Linux": + if static: + return "lib%s.a" % base_name + return "lib%s.so%s" % (base_name, version) + elif cpu_value == "Windows": + return "%s.lib" % base_name + elif cpu_value == "Darwin": + if static: + return "lib%s.a" % base_name + return "lib%s%s.dylib" % (base_name, version) + else: + _auto_configure_fail("Invalid cpu_value: %s" % cpu_value) + +def _verify_build_defines(params): + """Verify all variables that crosstool/BUILD.tpl expects are substituted. + + Args: + params: dict of variables that will be passed to the BUILD.tpl template. + """ + missing = [] + for param in [ + "cxx_builtin_include_directories", + "extra_no_canonical_prefixes_flags", + "host_compiler_path", + "host_compiler_prefix", + "host_compiler_warnings", + "linker_bin_path", + "compiler_deps", + "msvc_cl_path", + "msvc_env_include", + "msvc_env_lib", + "msvc_env_path", + "msvc_env_tmp", + "msvc_lib_path", + "msvc_link_path", + "msvc_ml_path", + "unfiltered_compile_flags", + "win_compiler_deps", + ]: + if ("%{" + param + "}") not in params: + missing.append(param) + + if missing: + _auto_configure_fail( + "BUILD.tpl template is missing these variables: " + + str(missing) + + ".\nWe only got: " + + str(params) + + ".", + ) + +def get_cuda_version(repository_ctx): + return get_host_environ(repository_ctx, _TF_CUDA_VERSION) + +def enable_cuda(repository_ctx): + """Returns whether to build with CUDA support.""" + return int(get_host_environ(repository_ctx, TF_NEED_CUDA, False)) + +def _flag_enabled(repository_ctx, flag_name): + return get_host_environ(repository_ctx, flag_name) == "1" + +def _use_nvcc_and_clang(repository_ctx): + # Returns True if we should use clang for C++ and NVCC for CUDA. + return _flag_enabled(repository_ctx, _TF_NVCC_CLANG) + +def _tf_sysroot(repository_ctx): + return get_host_environ(repository_ctx, _TF_SYSROOT, "") + +def _py_tmpl_dict(d): + return {"%{cuda_config}": str(d)} + +def _cudart_static_linkopt(cpu_value): + """Returns additional platform-specific linkopts for cudart.""" + return "\"\"," if cpu_value == "Darwin" else "\"-lrt\"," + +def _compute_capabilities(repository_ctx): + """Returns a list of strings representing cuda compute capabilities. + + Args: + repository_ctx: the repo rule's context. + + Returns: + list of cuda architectures to compile for. 'compute_xy' refers to + both PTX and SASS, 'sm_xy' refers to SASS only. + """ + capabilities = get_host_environ( + repository_ctx, + _TF_CUDA_COMPUTE_CAPABILITIES, + "compute_35,compute_52", + ).split(",") + + # Map old 'x.y' capabilities to 'compute_xy'. 
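+ # For example, ["6.0", "7.0", "8.0"] becomes ["sm_60", "sm_70", "compute_80"]: + # SASS is emitted for every requested architecture, PTX only for the highest one.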
+ if len(capabilities) > 0 and all([len(x.split(".")) == 2 for x in capabilities]): + # If all capabilities are in 'x.y' format, only include PTX for the + # highest capability. + cc_list = sorted([x.replace(".", "") for x in capabilities]) + capabilities = ["sm_%s" % x for x in cc_list[:-1]] + ["compute_%s" % cc_list[-1]] + for i, capability in enumerate(capabilities): + parts = capability.split(".") + if len(parts) != 2: + continue + capabilities[i] = "compute_%s%s" % (parts[0], parts[1]) + + # Make list unique + capabilities = dict(zip(capabilities, capabilities)).keys() + + # Validate capabilities. + for capability in capabilities: + if not capability.startswith(("compute_", "sm_")): + _auto_configure_fail("Invalid compute capability: %s" % capability) + for prefix in ["compute_", "sm_"]: + if not capability.startswith(prefix): + continue + if len(capability) == len(prefix) + 2 and capability[-2:].isdigit(): + continue + if len(capability) == len(prefix) + 3 and capability.endswith("90a"): + continue + _auto_configure_fail("Invalid compute capability: %s" % capability) + + return capabilities + +def _compute_cuda_extra_copts(compute_capabilities): + copts = ["--no-cuda-include-ptx=all"] + for capability in compute_capabilities: + if capability.startswith("compute_"): + capability = capability.replace("compute_", "sm_") + copts.append("--cuda-include-ptx=%s" % capability) + copts.append("--cuda-gpu-arch=%s" % capability) + + return str(copts) + +def _get_cuda_config(repository_ctx): + """Detects and returns information about the CUDA installation on the system. + + Args: + repository_ctx: The repository context. + + Returns: + A struct containing the following fields: + cuda_version: The version of CUDA on the system. + cudart_version: The CUDA runtime version on the system. + cudnn_version: The version of cuDNN on the system. + compute_capabilities: A list of the system's CUDA compute capabilities. + cpu_value: The name of the host operating system. + """ + + return struct( + cuda_version = get_cuda_version(repository_ctx), + cupti_version = repository_ctx.read(repository_ctx.attr.cupti_version), + cudart_version = repository_ctx.read(repository_ctx.attr.cudart_version), + cublas_version = repository_ctx.read(repository_ctx.attr.cublas_version), + cusolver_version = repository_ctx.read(repository_ctx.attr.cusolver_version), + curand_version = repository_ctx.read(repository_ctx.attr.curand_version), + cufft_version = repository_ctx.read(repository_ctx.attr.cufft_version), + cusparse_version = repository_ctx.read(repository_ctx.attr.cusparse_version), + cudnn_version = repository_ctx.read(repository_ctx.attr.cudnn_version), + compute_capabilities = _compute_capabilities(repository_ctx), + cpu_value = get_cpu_value(repository_ctx), + ) + +_DUMMY_CROSSTOOL_BZL_FILE = """ +def error_gpu_disabled(): + fail("ERROR: Building with --config=cuda but TensorFlow is not configured " + + "to build with GPU support. Please re-run ./configure and enter 'Y' " + + "at the prompt to build with GPU support.") + + native.genrule( + name = "error_gen_crosstool", + outs = ["CROSSTOOL"], + cmd = "echo 'Should not be run.' && exit 1", + ) + + native.filegroup( + name = "crosstool", + srcs = [":CROSSTOOL"], + output_licenses = ["unencumbered"], + ) +""" + +_DUMMY_CROSSTOOL_BUILD_FILE = """ +load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") + +error_gpu_disabled() +""" + +def _create_dummy_repository(repository_ctx): + cpu_value = get_cpu_value(repository_ctx) + + # Set up BUILD file for cuda/. 
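+ # Every template placeholder is substituted with an empty or dummy value so + # that targets referencing @local_config_cuda still load when CUDA is disabled.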
+ repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "False", + "%{cuda_extra_copts}": "[]", + "%{cuda_gpu_architectures}": "[]", + "%{cuda_version}": "0.0", + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.dummy_cuda_build_tpl, + { + "%{cuda_driver_lib}": _lib_name("cuda", cpu_value), + "%{cudart_static_lib}": _lib_name( + "cudart_static", + cpu_value, + static = True, + ), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), + "%{cudart_lib}": _lib_name("cudart", cpu_value), + "%{cublas_lib}": _lib_name("cublas", cpu_value), + "%{cublasLt_lib}": _lib_name("cublasLt", cpu_value), + "%{cusolver_lib}": _lib_name("cusolver", cpu_value), + "%{cudnn_lib}": _lib_name("cudnn", cpu_value), + "%{cufft_lib}": _lib_name("cufft", cpu_value), + "%{curand_lib}": _lib_name("curand", cpu_value), + "%{cupti_lib}": _lib_name("cupti", cpu_value), + "%{cusparse_lib}": _lib_name("cusparse", cpu_value), + "%{cub_actual}": ":cuda_headers", + "%{copy_rules}": """ +filegroup(name="cuda-include") +filegroup(name="cublas-include") +filegroup(name="cusolver-include") +filegroup(name="cufft-include") +filegroup(name="cusparse-include") +filegroup(name="curand-include") +filegroup(name="cudnn-include") +""", + }, + ) + + # Create dummy files for the CUDA toolkit since they are still required by + # tensorflow/tsl/platform/default/build_config:cuda. + repository_ctx.file("cuda/cuda/include/cuda.h") + repository_ctx.file("cuda/cuda/include/cublas.h") + repository_ctx.file("cuda/cuda/include/cudnn.h") + repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") + repository_ctx.file("cuda/cuda/nvml/include/nvml.h") + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value)) + repository_ctx.file( + "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value), + ) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublasLt", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusparse", cpu_value)) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. + repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": "", + "%{cudart_version}": "", + "%{cupti_version}": "", + "%{cublas_version}": "", + "%{cusolver_version}": "", + "%{curand_version}": "", + "%{cufft_version}": "", + "%{cusparse_version}": "", + "%{cudnn_version}": "", + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": "", + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({}), + ) + + # If cuda_configure is not configured to build with GPU support, and the user + # attempts to build with --config=cuda, add a dummy build rule to intercept + # this and fail with an actionable error message. 
+ repository_ctx.file( + "crosstool/error_gpu_disabled.bzl", + _DUMMY_CROSSTOOL_BZL_FILE, + ) + repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE) + +def _create_local_cuda_repository(repository_ctx): + """Creates the repository containing files set up to build with CUDA.""" + cuda_config = _get_cuda_config(repository_ctx) + + # Set up BUILD file for cuda/ + repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "True", + "%{cuda_extra_copts}": _compute_cuda_extra_copts( + cuda_config.compute_capabilities, + ), + "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities), + "%{cuda_version}": cuda_config.cuda_version, + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.cuda_build_tpl, + { + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cub_actual}": ":cuda_headers", + }, + ) + + is_nvcc_and_clang = _use_nvcc_and_clang(repository_ctx) + tf_sysroot = _tf_sysroot(repository_ctx) + + # Set up crosstool/ + cc = _find_cc(repository_ctx) + host_compiler_includes = get_cxx_inc_directories( + repository_ctx, + cc, + tf_sysroot, + ) + + cuda_defines = {} + + # We do not support hermetic CUDA on Windows. + # This ensures the CROSSTOOL file parser is happy. + cuda_defines.update({ + "%{msvc_env_tmp}": "msvc_not_used", + "%{msvc_env_path}": "msvc_not_used", + "%{msvc_env_include}": "msvc_not_used", + "%{msvc_env_lib}": "msvc_not_used", + "%{msvc_cl_path}": "msvc_not_used", + "%{msvc_ml_path}": "msvc_not_used", + "%{msvc_link_path}": "msvc_not_used", + "%{msvc_lib_path}": "msvc_not_used", + "%{win_compiler_deps}": ":empty", + }) + + cuda_defines["%{builtin_sysroot}"] = tf_sysroot + cuda_defines["%{cuda_toolkit_path}"] = repository_ctx.attr.nvcc_binary.workspace_root + cuda_defines["%{compiler}"] = "clang" + cuda_defines["%{host_compiler_prefix}"] = "/usr/bin" + cuda_defines["%{linker_bin_path}"] = "" + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" + cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings(host_compiler_includes) + cuda_defines["%{cuda_nvcc_files}"] = "if_cuda([\"@{nvcc_archive}//:bin\", \"@{nvcc_archive}//:nvvm\"])".format(nvcc_archive = repository_ctx.attr.nvcc_binary.repo_name) + + if not is_nvcc_and_clang: + cuda_defines["%{host_compiler_path}"] = str(cc) + cuda_defines["%{host_compiler_warnings}"] = """ + # Some parts of the codebase set -Werror and hit this warning, so + # switch it off for now. 
+ "-Wno-invalid-partial-specialization" + """ + cuda_defines["%{compiler_deps}"] = ":cuda_nvcc_files" + repository_ctx.file( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + "", + ) + else: + cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" + cuda_defines["%{host_compiler_warnings}"] = "" + + nvcc_relative_path = "%s/%s" % (repository_ctx.attr.nvcc_binary.workspace_root, repository_ctx.attr.nvcc_binary.name) + cuda_defines["%{compiler_deps}"] = ":crosstool_wrapper_driver_is_not_gcc" + + wrapper_defines = { + "%{cpu_compiler}": str(cc), + "%{cuda_version}": cuda_config.cuda_version, + "%{nvcc_path}": nvcc_relative_path, + "%{host_compiler_path}": str(cc), + "%{use_clang_compiler}": "True", + } + repository_ctx.template( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + repository_ctx.attr.crosstool_wrapper_driver_is_not_gcc_tpl, + wrapper_defines, + ) + + _verify_build_defines(cuda_defines) + + # Only expand template variables in the BUILD file + repository_ctx.template( + "crosstool/BUILD", + repository_ctx.attr.crosstool_build_tpl, + cuda_defines, + ) + + # No templating of cc_toolchain_config - use attributes and templatize the + # BUILD file. + repository_ctx.template( + "crosstool/cc_toolchain_config.bzl", + repository_ctx.attr.cc_toolchain_config_tpl, + {}, + ) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. + repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": cuda_config.cuda_version, + "%{cudart_version}": cuda_config.cudart_version, + "%{cupti_version}": cuda_config.cupti_version, + "%{cublas_version}": cuda_config.cublas_version, + "%{cusolver_version}": cuda_config.cusolver_version, + "%{curand_version}": cuda_config.curand_version, + "%{cufft_version}": cuda_config.cufft_version, + "%{cusparse_version}": cuda_config.cusparse_version, + "%{cudnn_version}": cuda_config.cudnn_version, + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": ", ".join([ + cc.split("_")[1] + for cc in cuda_config.compute_capabilities + ]), + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({ + "cuda_version": cuda_config.cuda_version, + "cudnn_version": cuda_config.cudnn_version, + "cuda_compute_capabilities": cuda_config.compute_capabilities, + "cpu_compiler": str(cc), + }), + ) + +def _cuda_autoconf_impl(repository_ctx): + """Implementation of the cuda_autoconf repository rule.""" + build_file = repository_ctx.attr.local_config_cuda_build_file + + if not enable_cuda(repository_ctx): + _create_dummy_repository(repository_ctx) + else: + _create_local_cuda_repository(repository_ctx) + + repository_ctx.symlink(build_file, "BUILD") + +_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" +_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" +_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" +_TF_CUDA_VERSION = "TF_CUDA_VERSION" +TF_NEED_CUDA = "TF_NEED_CUDA" +_TF_NVCC_CLANG = "TF_NVCC_CLANG" +_TF_SYSROOT = "TF_SYSROOT" + +_ENVIRONS = [ + _CLANG_CUDA_COMPILER_PATH, + TF_NEED_CUDA, + _TF_NVCC_CLANG, + _TF_CUDA_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + _TF_SYSROOT, + _PYTHON_BIN_PATH, + "TMP", + "TMPDIR", +] + +hermetic_cuda_configure = repository_rule( + implementation = _cuda_autoconf_impl, + environ = _ENVIRONS, + attrs = { 
+ "environ": attr.string_dict(), + "cublas_version": attr.label(default = Label("@cuda_cublas//:version.txt")), + "cudart_version": attr.label(default = Label("@cuda_cudart//:version.txt")), + "cudnn_version": attr.label(default = Label("@cuda_cudnn//:version.txt")), + "cufft_version": attr.label(default = Label("@cuda_cufft//:version.txt")), + "cupti_version": attr.label(default = Label("@cuda_cupti//:version.txt")), + "curand_version": attr.label(default = Label("@cuda_curand//:version.txt")), + "cusolver_version": attr.label(default = Label("@cuda_cusolver//:version.txt")), + "cusparse_version": attr.label(default = Label("@cuda_cusparse//:version.txt")), + "nvcc_binary": attr.label(default = Label("@cuda_nvcc//:bin/nvcc")), + "local_config_cuda_build_file": attr.label(default = Label("//third_party/gpus:local_config_cuda.BUILD")), + "build_defs_tpl": attr.label(default = Label("//third_party/gpus/cuda:build_defs.bzl.tpl")), + "cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.hermetic.tpl")), + "dummy_cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.tpl")), + "cuda_config_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.h.tpl")), + "cuda_config_py_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.py.tpl")), + "crosstool_wrapper_driver_is_not_gcc_tpl": attr.label(default = Label("//third_party/gpus/crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl")), + "crosstool_build_tpl": attr.label(default = Label("//third_party/gpus/crosstool:BUILD.tpl")), + "cc_toolchain_config_tpl": attr.label(default = Label("//third_party/gpus/crosstool:cc_toolchain_config.bzl.tpl")), + }, +) +"""Detects and configures the hermetic CUDA toolchain. + +Add the following to your WORKSPACE FILE: + +```python +hermetic cuda_configure(name = "local_config_cuda") +``` + +Args: + name: A unique name for this workspace rule. 
+""" diff --git a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl index 0fd4019fc5bb75..cf756b452e3950 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl @@ -22,12 +22,15 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "enable_cuda", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) load( ":sycl_configure.bzl", diff --git a/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl index 05330b2fe53195..dd80694e7274f5 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl @@ -16,11 +16,14 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" diff --git a/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl b/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl index 53a6d4e1e41890..a0930df34ecec8 100644 --- a/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl @@ -5,7 +5,6 @@ load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") # CUDA toolkit version as tuple (e.g. '(11, 1)'). _cuda_version = %{cuda_version} -_cuda_clang = %{cuda_clang} def _rdc_copts(): """Returns copts for compiling relocatable device code.""" @@ -121,25 +120,25 @@ _device_link = rule( "gpu_archs": attr.string_list(), "nvlink_args": attr.string_list(), "_nvlink": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"), + default = Label("%{nvlink_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_fatbinary": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"), + default = Label("%{fatbinary_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_bin2c": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"), + default = Label("%{bin2c_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_link_stub": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"), + default = Label("%{link_stub_label}"), allow_single_file = True, ), }, @@ -189,7 +188,7 @@ _prune_relocatable_code = rule( "input": attr.label(mandatory = True, allow_files = True), "gpu_archs": attr.string_list(), "_nvprune": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"), + default = Label("%{nvprune_label}"), allow_single_file = True, executable = True, cfg = "host", diff --git a/third_party/xla/third_party/tsl/third_party/nccl/hermetic_nccl_configure.bzl b/third_party/xla/third_party/tsl/third_party/nccl/hermetic_nccl_configure.bzl new file mode 100644 index 00000000000000..b99cbcb08db58a --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/nccl/hermetic_nccl_configure.bzl @@ -0,0 +1,153 @@ +"""Repository rule for hermetic NCCL configuration. 
+ +`hermetic_nccl_configure` depends on the following environment variables: + + * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should + be used, "0" if NCCL should be linked in statically. + +""" + +load( + "//third_party/gpus:hermetic_cuda_configure.bzl", + "TF_NEED_CUDA", + "enable_cuda", + "get_cuda_version", +) +load( + "//third_party/remote_config:common.bzl", + "get_cpu_value", + "get_host_environ", +) + +_TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" + +_NCCL_DUMMY_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl_config", + hdrs = ["nccl_config.h"], + include_prefix = "third_party/nccl", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_STUB_BUILD_CONTENT = """ +alias( + name = "nccl_lib", + actual = "@cuda_nccl//:nccl_lib", +) + +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl_via_stub", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_headers", + actual = "@nccl_archive//:nccl_headers", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +def _create_local_nccl_repository(repository_ctx): + cuda_version = get_cuda_version(repository_ctx) + if cuda_version == "12": + cuda_version = "12.3" + cuda_version = cuda_version.split(".") + + # Alias to open source build from @nccl_archive. + if get_host_environ(repository_ctx, _TF_NCCL_USE_STUB, "0") == "0": + repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT) + else: + repository_ctx.file("BUILD", _NCCL_ARCHIVE_STUB_BUILD_CONTENT) + + repository_ctx.template("generated_names.bzl", repository_ctx.attr.generated_names_tpl, {}) + repository_ctx.template( + "build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), + "%{nvlink_label}": "@cuda_nvcc//:nvlink", + "%{fatbinary_label}": "@cuda_nvcc//:fatbinary", + "%{bin2c_label}": "@cuda_nvcc//:bin2c", + "%{link_stub_label}": "@cuda_nvcc//:link_stub", + "%{nvprune_label}": "@cuda_nvprune//:nvprune", + }, + ) + +def _nccl_autoconf_impl(repository_ctx): + if (not enable_cuda(repository_ctx) or + get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")): + # Add a dummy build file to make bazel query happy. 
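+ # NCCL is only configured when CUDA is enabled on Linux or FreeBSD; on every + # other platform we emit empty stand-in targets and an empty TF_NCCL_VERSION.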
+ repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT) + repository_ctx.file("nccl_config.h", "#define TF_NCCL_VERSION \"\"") + else: + _create_local_nccl_repository(repository_ctx) + +_ENVIRONS = [ + TF_NEED_CUDA, +] + +hermetic_nccl_configure = repository_rule( + environ = _ENVIRONS, + implementation = _nccl_autoconf_impl, + attrs = { + "environ": attr.string_dict(), + "generated_names_tpl": attr.label(default = Label("//third_party/nccl:generated_names.bzl.tpl")), + "build_defs_tpl": attr.label(default = Label("//third_party/nccl:build_defs.bzl.tpl")), + "system_build_tpl": attr.label(default = Label("//third_party/nccl:system.BUILD.tpl")), + }, +) +"""Downloads and configures the hermetic NCCL configuration. + +Add the following to your WORKSPACE FILE: + +```python +hermetic_nccl_configure(name = "local_config_nccl") +``` + +Args: + name: A unique name for this workspace rule. +""" diff --git a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl index 22cf64d4771062..4da2513e03eb44 100644 --- a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl @@ -8,7 +8,6 @@ files. * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is `/usr/local/cuda,usr/`. - * `TF_CUDA_CLANG`: "1" if using Clang, "0" if using NVCC. * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should be used, "0" if NCCL should be linked in statically. @@ -33,7 +32,6 @@ _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" _TF_NCCL_VERSION = "TF_NCCL_VERSION" _TF_NEED_CUDA = "TF_NEED_CUDA" _TF_CUDA_PATHS = "TF_CUDA_PATHS" -_TF_CUDA_CLANG = "TF_CUDA_CLANG" _TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR" @@ -129,7 +127,11 @@ def _create_local_nccl_repository(repository_ctx): _label("build_defs.bzl.tpl"), { "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), - "%{cuda_clang}": repr(get_host_environ(repository_ctx, _TF_CUDA_CLANG)), + "%{nvlink_label}": "@local_config_cuda//cuda:cuda/bin/nvlink", + "%{fatbinary_label}": "@local_config_cuda//cuda:cuda/bin/fatbinary", + "%{bin2c_label}": "@local_config_cuda//cuda:cuda/bin/bin2c", + "%{link_stub_label}": "@local_config_cuda//cuda:cuda/bin/crt/link.stub", + "%{nvprune_label}": "@local_config_cuda//cuda:cuda/bin/nvprune", }, ) else: @@ -181,7 +183,6 @@ _ENVIRONS = [ _TF_CUDA_COMPUTE_CAPABILITIES, _TF_NEED_CUDA, _TF_CUDA_PATHS, - _TF_CUDA_CLANG, ] remote_nccl_configure = repository_rule( diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl index f2eecd61a5faf7..c105ee4544f51f 100644 --- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl +++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl @@ -703,7 +703,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -742,7 +742,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -782,7 +782,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - 
"TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) @@ -820,7 +820,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl index 18a84d96c39f82..9ade984f45351d 100644 --- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl +++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl @@ -1,8 +1,13 @@ """Macro that creates external repositories for remote config.""" -load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") -load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure") @@ -42,7 +47,7 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "TF_CUDNN_VERSION": cudnn_version, "TF_CUDA_VERSION": cuda_version, "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "", "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu", "GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "", @@ -58,13 +63,17 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "Pool": "default", } - remote_cuda_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_cuda_configure. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_nccl_configure. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, @@ -175,13 +184,15 @@ def sigbuild_tf_configs(name_container_map, env): "Pool": "default", } - remote_cuda_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_cuda_configure for non-hermetic CUDA. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_nccl_configure for non-hermetic NCCL. 
+ hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD index ca974552eca1ab..9db8ada08122f6 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD +++ b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD @@ -3,6 +3,7 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library") load( "@local_xla//xla/tsl:tsl.bzl", + "if_hermetic_cuda_tools", "if_not_fuchsia", "if_not_windows", "internal_visibility", @@ -58,6 +59,9 @@ cc_library( srcs = ["cuda_libdevice_path.cc"], hdrs = ["//tsl/platform:cuda_libdevice_path.h"], compatible_with = [], + data = if_hermetic_cuda_tools([ + "@cuda_nvcc//:nvvm", + ]), tags = [ "manual", "no_oss", @@ -65,6 +69,7 @@ cc_library( ], deps = [ "//tsl/platform", + "//tsl/platform:env", "//tsl/platform:logging", "//tsl/platform:path", "//tsl/platform:types", diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc b/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc index 46321e74b5dc38..ee95a3d17fd68e 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc @@ -31,6 +31,7 @@ limitations under the License. #if !defined(PLATFORM_GOOGLE) #include "third_party/gpus/cuda/cuda_config.h" +#include "tsl/platform/env.h" #endif #include "tsl/platform/logging.h" @@ -40,6 +41,17 @@ std::vector CandidateCudaRoots() { #if !defined(PLATFORM_GOOGLE) auto roots = std::vector{TF_CUDA_TOOLKIT_PATH, std::string("/usr/local/cuda")}; + std::string runfiles_suffix = "runfiles"; + std::string executable_path = tsl::Env::Default()->GetExecutablePath(); + std::string cuda_nvcc_dir = + io::JoinPath(executable_path + "." + runfiles_suffix, "cuda_nvcc"); + roots.emplace_back(cuda_nvcc_dir); + std::string runfiles_dir = tsl::Env::Default()->GetRunfilesDir(); + std::size_t runfiles_ind = runfiles_dir.rfind(runfiles_suffix); + cuda_nvcc_dir = io::JoinPath( + runfiles_dir.substr(0, runfiles_ind + runfiles_suffix.length()), + "cuda_nvcc"); + roots.emplace_back(cuda_nvcc_dir); #if defined(PLATFORM_POSIX) && !defined(__APPLE__) Dl_info info; @@ -53,6 +65,9 @@ std::vector CandidateCudaRoots() { // relative to the current binary for the wheel-based nvcc package. for (auto path : {"../nvidia/cuda_nvcc", "../../nvidia/cuda_nvcc"}) roots.emplace_back(io::JoinPath(dir, path)); + + // Also add the path to the copy of libdevice.10.bc we include with XLA. + roots.emplace_back(io::JoinPath(dir, "cuda")); } #endif // defined(PLATFORM_POSIX) && !defined(__APPLE__) diff --git a/third_party/xla/third_party/tsl/workspace2.bzl b/third_party/xla/third_party/tsl/workspace2.bzl index 001ce018d87066..7274961489c516 100644 --- a/third_party/xla/third_party/tsl/workspace2.bzl +++ b/third_party/xla/third_party/tsl/workspace2.bzl @@ -6,6 +6,7 @@ load("@bazel_skylib//lib:versions.bzl", "versions") # Import external repository rules. load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external") load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external") +load("//third_party:cuda_repo.bzl", "cuda_distributives") load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") # Import third party repository rules. See go/tfbr-thirdparty. 
@@ -17,14 +18,20 @@ load("//third_party/eigen3:workspace.bzl", eigen3 = "repo") load("//third_party/farmhash:workspace.bzl", farmhash = "repo") load("//third_party/gemmlowp:workspace.bzl", gemmlowp = "repo") load("//third_party/git:git_configure.bzl", "git_configure") -load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "rocm_configure") load("//third_party/gpus:sycl_configure.bzl", "sycl_configure") load("//third_party/hwloc:workspace.bzl", hwloc = "repo") load("//third_party/implib_so:workspace.bzl", implib_so = "repo") load("//third_party/llvm:setup.bzl", "llvm_setup") load("//third_party/nasm:workspace.bzl", nasm = "repo") -load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "python_configure") load("//third_party/py/ml_dtypes:workspace.bzl", ml_dtypes = "repo") load("//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo") @@ -69,9 +76,15 @@ def _tf_toolchains(): # Note that we check the minimum bazel version in WORKSPACE. clang6_configure(name = "local_config_clang6") cc_download_clang_toolchain(name = "local_config_download_clang") - cuda_configure(name = "local_config_cuda") + + # If you need to use non-hermetic CUDA, replace the line below with + # cuda_configure(name = "local_config_cuda") + hermetic_cuda_configure(name = "local_config_cuda") tensorrt_configure(name = "local_config_tensorrt") - nccl_configure(name = "local_config_nccl") + + # If you need to use non-hermetic CUDA, replace the line below with + # nccl_configure(name = "local_config_nccl") + hermetic_nccl_configure(name = "local_config_nccl") git_configure(name = "local_config_git") syslibs_configure(name = "local_config_syslibs") python_configure(name = "local_config_python") @@ -597,6 +610,28 @@ def _tf_repositories(): urls = tf_mirror_urls("https://github.com/google/glog/archive/refs/tags/v0.4.0.tar.gz"), ) +_CUDA_12_3_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + "sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + +_CUDA_12_1_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + 
"sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + def workspace(): # Check the bazel version before executing any repository rules, in case # those rules rely on the version we require here. @@ -614,6 +649,10 @@ def workspace(): # don't already exist (at least if the external repository macros were # written according to common practice to query native.existing_rule()). _tf_repositories() + cuda_distributives(cuda_nccl_wheel_dict = { + "12.3.2": _CUDA_12_3_NCCL_WHEEL_DICT, + "12.1.1": _CUDA_12_1_NCCL_WHEEL_DICT, + }) # Alias so it can be loaded without assigning to a different symbol to prevent # shadowing previous loads and trigger a buildifier warning. diff --git a/third_party/xla/third_party/tsl/workspace3.bzl b/third_party/xla/third_party/tsl/workspace3.bzl index a1293f59a48885..adba216bc518a1 100644 --- a/third_party/xla/third_party/tsl/workspace3.bzl +++ b/third_party/xla/third_party/tsl/workspace3.bzl @@ -1,8 +1,31 @@ """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it.""" load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:cuda_redist_json_repo.bzl", "cuda_redist_json") load("//third_party/llvm:workspace.bzl", llvm = "repo") +_CUDA_REDIST_JSON_DICT = { + "12.1.1": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.1.1.json", + "bafea3cb83a4cf5c764eeedcaac0040d0d3c5db3f9a74550da0e7b6ac24d378c", + ], + "12.3.2": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.3.2.json", + "1b6eacf335dd49803633fed53ef261d62c193e5a56eee5019e7d2f634e39e7ef", + ], +} + +_CUDNN_REDIST_JSON_DICT = { + "8.6": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.6.0.json", + "7f6f50bed4fd8216dc10d6ef505771dc0ecc99cce813993ab405cb507a21d51d", + ], + "8.9.7.29": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.9.7.29.json", + "a0734f26f068522464fa09b2f2c186dfbe6ad7407a88ea0c50dd331f0c3389ec", + ], +} + def workspace(): http_archive( name = "io_bazel_rules_closure", @@ -46,6 +69,13 @@ def workspace(): # but provides a script for setting up build rules via overlays. llvm("llvm-raw") + # Load JSON files for CUDA and cuDNN distribution versions. + cuda_redist_json( + name = "cuda_redist_json", + cuda_json_dict = _CUDA_REDIST_JSON_DICT, + cudnn_json_dict = _CUDNN_REDIST_JSON_DICT, + ) + # Alias so it can be loaded without assigning to a different symbol to prevent # shadowing previous loads and trigger a buildifier warning. 
tsl_workspace3 = workspace diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl index f2eecd61a5faf7..c105ee4544f51f 100644 --- a/third_party/xla/tools/toolchains/remote_config/configs.bzl +++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl @@ -703,7 +703,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -742,7 +742,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -782,7 +782,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) @@ -820,7 +820,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) diff --git a/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl b/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl index 18a84d96c39f82..9ade984f45351d 100644 --- a/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl +++ b/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl @@ -1,8 +1,13 @@ """Macro that creates external repositories for remote config.""" -load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") -load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure") @@ -42,7 +47,7 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "TF_CUDNN_VERSION": cudnn_version, "TF_CUDA_VERSION": cuda_version, "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "", "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu", "GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "", @@ -58,13 +63,17 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "Pool": "default", } - remote_cuda_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_cuda_configure. 
+ hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_nccl_configure. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, @@ -175,13 +184,15 @@ def sigbuild_tf_configs(name_container_map, env): "Pool": "default", } - remote_cuda_configure( + # TODO(ybaturina): non-hermetic CUDA is deprecated; replace with + # remote_cuda_configure only if it is still required. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # TODO(ybaturina): non-hermetic NCCL is deprecated; replace with + # remote_nccl_configure only if it is still required. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD index 5c8aaea5723b70..ac1d1a87becb06 100644 --- a/third_party/xla/xla/service/BUILD +++ b/third_party/xla/xla/service/BUILD @@ -1382,6 +1382,7 @@ cc_library( ]) + if_cuda_is_configured([ "//xla/service/gpu:nvptx_compiler", "//xla/stream_executor/cuda:stream_executor_cuda", + "//xla/tsl:gpu_runtime_hermetic_cuda_deps", ]) + if_rocm_is_configured([ "//xla/service/gpu:amdgpu_compiler", "//xla/stream_executor/rocm:stream_executor_rocm", diff --git a/third_party/xla/xla/service/gpu/tests/add_preds.hlo b/third_party/xla/xla/service/gpu/tests/add_preds.hlo index 120b6a5ad686bf..b106b806c0470c 100644 --- a/third_party/xla/xla/service/gpu/tests/add_preds.hlo +++ b/third_party/xla/xla/service/gpu/tests/add_preds.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s // CHECK: define void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]], diff --git a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo index c84e0194c347cb..ba00bd6423aa74 100644 --- a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo +++ b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // Arguments are passed separately.
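The RUN lines added to these .hlo lit tests all follow one pattern: they export XLA_FLAGS so that --xla_gpu_cuda_data_dir points at the hermetic cuda_nvcc repository inside the test's runfiles tree, letting hlo-opt locate libdevice without a system CUDA installation. lit substitutes %S with the directory of the test source, so the ../ chain simply climbs from that directory to the runfiles root where external repositories are materialized: five segments for the tests under xla/service/gpu/tests here, four for the xla/tools/hlo_opt tests further below. A minimal Python sketch of the resolution, assuming the workspace directory sits directly under the runfiles root (the helper and the local_xla directory name are illustrative, not part of the patch):

import os

def cuda_data_dir(test_source_dir, levels_up):
    # Mirrors %S/../../../../../cuda_nvcc: climb from the test's source
    # directory up to the runfiles root, then enter the cuda_nvcc repo.
    return os.path.normpath(
        os.path.join(test_source_dir, *([".."] * levels_up), "cuda_nvcc"))

# A test under <runfiles>/local_xla/xla/service/gpu/tests is five levels deep:
print(cuda_data_dir("local_xla/xla/service/gpu/tests", 5))  # -> cuda_nvcc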
diff --git a/third_party/xla/xla/service/gpu/tests/copy.hlo b/third_party/xla/xla/service/gpu/tests/copy.hlo index beac8e6d36b115..997cefda91b22a 100644 --- a/third_party/xla/xla/service/gpu/tests/copy.hlo +++ b/third_party/xla/xla/service/gpu/tests/copy.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo b/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo index 3d0af18b081103..974fb26e5a9193 100644 --- a/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo +++ b/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo b/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo index b49e155da0a685..05d2b141757621 100644 --- a/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo +++ b/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK-LLVM %s // We check that the row loads are vectorized. 
diff --git a/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo b/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo index 9a30436ebfa38c..e11711b8ba9556 100644 --- a/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo +++ b/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/fused_slice.hlo b/third_party/xla/xla/service/gpu/tests/fused_slice.hlo index 4affcb0de7533b..b5abb7dafa5960 100644 --- a/third_party/xla/xla/service/gpu/tests/fused_slice.hlo +++ b/third_party/xla/xla/service/gpu/tests/fused_slice.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo index 41734e06259a00..431edac2748f42 100644 --- a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo +++ b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // All fusions must reuse the same kernel: diff --git a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo index bcfa37733f7e67..ecdb8e91df4342 100644 --- a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo +++ b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // This tests that we do not increase the grid launch size when // few_waves is enabled. 
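The same RUN-line change repeats across the remaining GPU lit tests below. For background, the flag works together with the runfiles fallback added to CandidateCudaRoots in cuda_libdevice_path.cc earlier in this patch, which derives the hermetic CUDA root in two ways: from the executable path plus a ".runfiles" suffix, and by truncating GetRunfilesDir() at its last "runfiles" component. A rough Python rendering of that path arithmetic (a sketch mirroring the C++ above, not the implementation itself):

import os

def hermetic_cuda_roots(executable_path, runfiles_dir):
    suffix = "runfiles"
    roots = []
    # <binary>.runfiles/cuda_nvcc, derived from the executable location.
    roots.append(os.path.join(executable_path + "." + suffix, "cuda_nvcc"))
    # Cut runfiles_dir back to its last "runfiles" component, then descend.
    ind = runfiles_dir.rfind(suffix)
    roots.append(os.path.join(runfiles_dir[:ind + len(suffix)], "cuda_nvcc"))
    return roots

# Both forms resolve to /tmp/hlo_opt_test.runfiles/cuda_nvcc:
print(hermetic_cuda_roots("/tmp/hlo_opt_test",
                          "/tmp/hlo_opt_test.runfiles/local_xla"))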
diff --git a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo index 6e147df3928c09..5f2d6d64eb829d 100644 --- a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo +++ b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo b/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo index c7165c9e11763c..05537b327fdc57 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} // Check that for "min" we are still using atomics (CAS loop). diff --git a/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo b/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo index cb30643886de4e..1b30f13e0e1af9 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule reduce_with_layout_change, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo b/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo index 982e45863e2547..70138e77d9362b 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule m, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo b/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo index 736e583ad4c3c9..5fa7f4ed5b5c2c 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule LargeReduction, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo b/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo index 
bba7986d830fb3..7107fd24d491f2 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule RowReductionVectorized, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo b/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo index 844c3ded2ef024..919a4dc1e9c7a9 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s // CHECK: define void @fusion_row_reduction_too_small( diff --git a/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo b/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo index 36008daa5ceda8..64029a503506b8 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule Test, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo b/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo index 6a25580a4bcff9..a1d7eb6aa38619 100644 --- a/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/p100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM60 // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 diff --git a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo index e140b56af9d60c..7836f3ee1ddbe9 100644 --- a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo +++ b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule TestModule, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/scatter.hlo 
b/third_party/xla/xla/service/gpu/tests/scatter.hlo index 20211bdbe892f4..b81113587814da 100644 --- a/third_party/xla/xla/service/gpu/tests/scatter.hlo +++ b/third_party/xla/xla/service/gpu/tests/scatter.hlo @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // CHECK-LABEL: entry: diff --git a/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo b/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo index 08751943c13efb..587a605dca24ee 100644 --- a/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo +++ b/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo index c8378f746aa983..3fdbc565981679 100644 --- a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo +++ b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_80.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM80 // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/h100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM90 diff --git a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo index 242bd749bdaf11..a62181874c323c 100644 --- a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo +++ b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/transpose_021.hlo b/third_party/xla/xla/service/gpu/tests/transpose_021.hlo index 7d3e1fe0ffb9f6..ef8580fa62ac77 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_021.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_021.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations 
--xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo b/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo index 5e638321294f1e..659a9e2abf3dec 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/transpose_210.hlo b/third_party/xla/xla/service/gpu/tests/transpose_210.hlo index f37bd17ffe2a6e..1fb1dd139797da 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_210.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_210.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo b/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo index a3831d2da1de52..b9693dfc388679 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo index 2739e349181786..36704d7fcb280f 100644 --- a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo +++ b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s // CHECK-PTX: define void @triton_gemm_r( diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD index 234649a31ea65b..30d0d3d48c51dd 100644 --- a/third_party/xla/xla/stream_executor/cuda/BUILD +++ b/third_party/xla/xla/stream_executor/cuda/BUILD @@ -24,7 +24,7 @@ load( "tf_additional_cudnn_plugin_copts", "tf_additional_gpu_compilation_copts", ) -load("//xla/tsl:tsl.bzl", "if_google", "if_nccl", "internal_visibility", "tsl_copts") 
+load("//xla/tsl:tsl.bzl", "if_google", "if_hermetic_cuda_tools", "if_nccl", "internal_visibility", "tsl_copts") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -113,13 +113,31 @@ cuda_only_cc_library( # Buildozer can not remove dependencies inside select guards, so we have to use # an intermediate target. -cc_library(name = "ptxas_wrapper") +cc_library( + name = "ptxas_wrapper", + data = if_hermetic_cuda_tools( + ["@cuda_nvcc//:ptxas"], + [], + ), +) -cc_library(name = "nvlink_wrapper") +cc_library( + name = "nvlink_wrapper", + data = if_hermetic_cuda_tools( + ["@cuda_nvcc//:nvlink"], + [], + ), +) # Buildozer can not remove dependencies inside select guards, so we have to use # an intermediate target. -cc_library(name = "fatbinary_wrapper") +cc_library( + name = "fatbinary_wrapper", + data = if_hermetic_cuda_tools( + ["@cuda_nvcc//:fatbinary"], + [], + ), +) cuda_only_cc_library( name = "cuda_driver", diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo index 0633b7e5ef7ce8..8ab7b9039faf0f 100755 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule module diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo index 61b6b9aa778b9e..66317c2d276c46 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=hlo-backend --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule module diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo index e7b8321cc6480b..a706e7c75df0d1 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=buffer-assignment --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule m diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo index 59800a9d170560..2eb00d4cac81b8 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule m diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo index 5c6485a57813a8..fae7ed1437107d 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=CUDA --stage=ptx --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s 
HloModule m diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo index 6c8bc8bd54fe6a..63766db3e0b039 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s // CHECK: fusion.in_bounds-true: diff --git a/third_party/xla/xla/tsl/BUILD b/third_party/xla/xla/tsl/BUILD index 8a7cb42086d5d5..47d112444da58e 100644 --- a/third_party/xla/xla/tsl/BUILD +++ b/third_party/xla/xla/tsl/BUILD @@ -1,7 +1,7 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@bazel_skylib//lib:selects.bzl", "selects") load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "bool_setting") -load("tsl.bzl", "if_google", "if_oss") +load("tsl.bzl", "if_google", "if_hermetic_cuda_libs", "if_oss") # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) @@ -551,3 +551,25 @@ cc_library( }), ) # copybara:comment_end + +cc_library( + name = "gpu_runtime_hermetic_cuda_deps", + tags = ["manual"], + visibility = ["//visibility:public"], + deps = if_hermetic_cuda_libs([ + "@cuda_cudart//:cudart", + "@cuda_cudnn//:cudnn", + "@cuda_cudnn//:cudnn_ops_infer", + "@cuda_cudnn//:cudnn_cnn_infer", + "@cuda_cudnn//:cudnn_ops_train", + "@cuda_cudnn//:cudnn_cnn_train", + "@cuda_cudnn//:cudnn_adv_infer", + "@cuda_cudnn//:cudnn_adv_train", + "@cuda_cublas//:cublas", + "@cuda_cublas//:cublasLt", + "@cuda_cusolver//:cusolver", + "@cuda_cufft//:cufft", + "@cuda_cusparse//:cusparse", + "@cuda_nvjitlink//:nvjitlink", + ]), +) diff --git a/third_party/xla/xla/tsl/cuda/BUILD.bazel b/third_party/xla/xla/tsl/cuda/BUILD.bazel index 6f0e9aefab72f0..0992a9bcf1c775 100644 --- a/third_party/xla/xla/tsl/cuda/BUILD.bazel +++ b/third_party/xla/xla/tsl/cuda/BUILD.bazel @@ -10,6 +10,10 @@ load( "cuda_rpath_flags", "if_cuda_is_configured", ) +load( + "//xla/tsl:tsl.bzl", + "if_hermetic_cuda_libs", +) load("//xla/tsl/cuda:stub.bzl", "cuda_stub") package( @@ -41,6 +45,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cublas//:cublas", ]), ) @@ -65,6 +71,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cublas//:cublasLt", ]), ) @@ -126,7 +134,9 @@ cc_library( "@local_tsl//tsl/platform:logging", ], "//conditions:default": [], - }), + }) + if_hermetic_cuda_libs([ + "@cuda_cudart//:cudart", + ]), ) cuda_stub( @@ -152,6 +162,14 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cudnn//:cudnn", + "@cuda_cudnn//:cudnn_ops_infer", + "@cuda_cudnn//:cudnn_cnn_infer", + "@cuda_cudnn//:cudnn_ops_train", + "@cuda_cudnn//:cudnn_cnn_train", + "@cuda_cudnn//:cudnn_adv_infer", + "@cuda_cudnn//:cudnn_adv_train", ]), ) @@ -189,6 +207,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cufft//:cufft", ]), ) @@ -216,6 +236,8 @@ cc_library( 
"@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cupti//:cupti", ]), ) @@ -241,6 +263,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cusolver//:cusolver", ]), ) @@ -266,6 +290,9 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cusparse//:cusparse", + "@cuda_nvjitlink//:nvjitlink", ]), ) @@ -293,5 +320,7 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_nccl//:nccl", ]), ) diff --git a/third_party/xla/xla/tsl/tsl.bzl b/third_party/xla/xla/tsl/tsl.bzl index 5ff893e03edda8..505a4deb8a4518 100644 --- a/third_party/xla/xla/tsl/tsl.bzl +++ b/third_party/xla/xla/tsl/tsl.bzl @@ -224,6 +224,17 @@ def if_with_tpu_support(if_true, if_false = []): "//conditions:default": if_false, }) +# These configs are used to determine whether we should use the hermetic CUDA +# tools in cc_libraries (see go/hermetic-cuda). +# They are intended for the OSS builds only. +def if_hermetic_cuda_tools(if_true, if_false = []): + """Shorthand for select()'ing on whether we're building with hermetic CUDA tools.""" + return select({"@local_config_cuda//cuda:hermetic_cuda_tools": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false + +def if_hermetic_cuda_libs(if_true, if_false = []): + """Shorthand for select()'ing on whether we need to include hermetic CUDA libraries.""" + return select({"@local_config_cuda//cuda:hermetic_cuda_tools_and_libs": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false + def get_win_copts(is_external = False): WINDOWS_COPTS = [ # copybara:uncomment_begin(no MSVC flags in google)