From 9b69f8b86b98d5a47cadede34a8c7059a4af7f31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 18 Mar 2024 10:09:31 -0700
Subject: [PATCH] Introduce hermetic CUDA in Google ML projects.

Instead of requiring pre-installed NVIDIA CUDA and cuDNN libraries and
environment variables pointing to their installation locations, Bazel now
downloads the CUDA and cuDNN redistributions into its cache automatically
and uses them during the build and test phases.

The Bazel version used in JAX is bumped from 6.1.2 to 6.5.0.

PiperOrigin-RevId: 616865795
---
 .bazelrc | 22 +-
 ci/official/wheel.sh | 3 +-
 configure.py | 210 +------
 .../kernel_gen/tests/hlo_to_kernel/tanh.mlir | 1 +
 tensorflow/core/common_runtime/gpu/BUILD | 1 +
 tensorflow/opensource_only.files | 21 +
 tensorflow/python/framework/BUILD | 8 +
 tensorflow/tensorflow.bzl | 6 +-
 .../tools/pip_package/build_pip_package.py | 28 +
 .../toolchains/remote_config/configs.bzl | 8 +-
 .../toolchains/remote_config/rbe_config.bzl | 26 +-
 tensorflow/workspace2.bzl | 47 +-
 tensorflow/workspace3.bzl | 30 +
 third_party/cuda_redist_json_repo.bzl | 110 ++++
 third_party/cuda_repo.bzl | 327 ++++++++++
 third_party/gpus/compiler_common_tools.bzl | 174 ++++++
 third_party/gpus/crosstool/BUILD.tpl | 11 +-
 third_party/gpus/cuda/BUILD.hermetic.tpl | 291 +++++++++
 third_party/gpus/cuda/BUILD.tpl | 28 +-
 third_party/gpus/cuda/cuda_cccl.BUILD | 12 +
 third_party/gpus/cuda/cuda_cublas.BUILD.tpl | 33 +
 third_party/gpus/cuda/cuda_cudart.BUILD.tpl | 34 ++
 third_party/gpus/cuda/cuda_cudnn.BUILD.tpl | 65 ++
 third_party/gpus/cuda/cuda_cufft.BUILD.tpl | 23 +
 third_party/gpus/cuda/cuda_cupti.BUILD.tpl | 23 +
 third_party/gpus/cuda/cuda_curand.BUILD.tpl | 23 +
 third_party/gpus/cuda/cuda_cusolver.BUILD.tpl | 25 +
 third_party/gpus/cuda/cuda_cusparse.BUILD.tpl | 23 +
 third_party/gpus/cuda/cuda_nccl.BUILD | 7 +
 third_party/gpus/cuda/cuda_nvcc.BUILD | 73 +++
 .../gpus/cuda/cuda_nvjitlink.BUILD.tpl | 23 +
 third_party/gpus/cuda/cuda_nvml.BUILD | 12 +
 third_party/gpus/cuda/cuda_nvprune.BUILD | 9 +
 third_party/gpus/cuda/cuda_nvtx.BUILD | 12 +
 third_party/gpus/cuda_configure.bzl | 170 +----
 third_party/gpus/hermetic_cuda_configure.bzl | 570 ++++++++++++++++++
 third_party/gpus/rocm_configure.bzl | 5 +-
 third_party/gpus/sycl_configure.bzl | 5 +-
 third_party/nccl/build_defs.bzl.tpl | 11 +-
 third_party/nccl/hermetic_nccl_configure.bzl | 153 +++++
 third_party/nccl/nccl_configure.bzl | 9 +-
 third_party/xla/.bazelrc | 22 +-
 .../xla/build_tools/configure/configure.py | 113 +---
 .../build_tools/configure/configure_test.py | 9 +-
 .../configure/testdata/cuda_clang.bazelrc | 4 +-
 .../configure/testdata/nvcc_clang.bazelrc | 4 +-
 .../configure/testdata/nvcc_gcc.bazelrc | 4 +-
 third_party/xla/opensource_only.files | 2 +
 .../xla/third_party/cuda_redist_json_repo.bzl | 110 ++++
 third_party/xla/third_party/cuda_repo.bzl | 327 ++++++++++
 third_party/xla/third_party/tsl/.bazelrc | 22 +-
 .../xla/third_party/tsl/opensource_only.files | 21 +
 .../tsl/third_party/cuda_redist_json_repo.bzl | 110 ++++
 .../third_party/tsl/third_party/cuda_repo.bzl | 327 ++++++++++
 .../gpus/compiler_common_tools.bzl | 174 ++++++
 .../tsl/third_party/gpus/crosstool/BUILD.tpl | 11 +-
 .../third_party/gpus/cuda/BUILD.hermetic.tpl | 291 +++++++++
 .../tsl/third_party/gpus/cuda/BUILD.tpl | 28 +-
 .../tsl/third_party/gpus/cuda/cuda_cccl.BUILD | 12 +
 .../gpus/cuda/cuda_cublas.BUILD.tpl | 33 +
 .../gpus/cuda/cuda_cudart.BUILD.tpl | 34 ++
 .../gpus/cuda/cuda_cudnn.BUILD.tpl | 65 ++
 .../gpus/cuda/cuda_cufft.BUILD.tpl | 23 +
.../gpus/cuda/cuda_cupti.BUILD.tpl | 23 + .../gpus/cuda/cuda_curand.BUILD.tpl | 23 + .../gpus/cuda/cuda_cusolver.BUILD.tpl | 25 + .../gpus/cuda/cuda_cusparse.BUILD.tpl | 23 + .../tsl/third_party/gpus/cuda/cuda_nccl.BUILD | 7 + .../tsl/third_party/gpus/cuda/cuda_nvcc.BUILD | 73 +++ .../gpus/cuda/cuda_nvjitlink.BUILD.tpl | 23 + .../tsl/third_party/gpus/cuda/cuda_nvml.BUILD | 12 + .../third_party/gpus/cuda/cuda_nvprune.BUILD | 9 + .../tsl/third_party/gpus/cuda/cuda_nvtx.BUILD | 12 + .../tsl/third_party/gpus/cuda_configure.bzl | 170 +----- .../gpus/hermetic_cuda_configure.bzl | 570 ++++++++++++++++++ .../tsl/third_party/gpus/rocm_configure.bzl | 5 +- .../tsl/third_party/gpus/sycl_configure.bzl | 5 +- .../tsl/third_party/nccl/build_defs.bzl.tpl | 11 +- .../nccl/hermetic_nccl_configure.bzl | 153 +++++ .../tsl/third_party/nccl/nccl_configure.bzl | 9 +- .../toolchains/remote_config/configs.bzl | 8 +- .../toolchains/remote_config/rbe_config.bzl | 25 +- .../tsl/tsl/platform/default/BUILD | 5 + .../platform/default/cuda_libdevice_path.cc | 15 + .../xla/third_party/tsl/workspace2.bzl | 47 +- .../xla/third_party/tsl/workspace3.bzl | 30 + .../toolchains/remote_config/configs.bzl | 8 +- .../toolchains/remote_config/rbe_config.bzl | 25 +- third_party/xla/xla/service/BUILD | 1 + .../xla/xla/service/gpu/tests/add_preds.hlo | 1 + .../service/gpu/tests/calling_convention.hlo | 1 + .../xla/xla/service/gpu/tests/copy.hlo | 1 + .../tests/dynamic_update_slice_inplace.hlo | 1 + .../tests/element_wise_row_vectorization.hlo | 1 + .../xla/service/gpu/tests/fused_scatter.hlo | 1 + .../xla/xla/service/gpu/tests/fused_slice.hlo | 1 + .../xla/service/gpu/tests/kernel_reuse.hlo | 1 + .../service/gpu/tests/launch_dimensions.hlo | 1 + .../xla/service/gpu/tests/pad_to_static.hlo | 1 + .../service/gpu/tests/reduce_atomic_min.hlo | 1 + .../gpu/tests/reduce_column_layout_change.hlo | 1 + .../service/gpu/tests/reduce_f64_column.hlo | 1 + .../gpu/tests/reduce_large_row_to_scalar.hlo | 1 + .../gpu/tests/reduce_row_vectorized.hlo | 1 + .../xla/service/gpu/tests/reduce_unnested.hlo | 1 + .../gpu/tests/reduce_variadic_column.hlo | 1 + .../tests/reduction_vectorization_sm_all.hlo | 1 + .../gpu/tests/rng_get_and_update_state.hlo | 1 + .../xla/xla/service/gpu/tests/scatter.hlo | 1 + .../service/gpu/tests/select_and_scatter.hlo | 1 + .../service/gpu/tests/single_instruction.hlo | 1 + .../service/gpu/tests/slice_to_dynamic.hlo | 1 + .../xla/service/gpu/tests/transpose_021.hlo | 1 + .../gpu/tests/transpose_021_extra_output.hlo | 1 + .../xla/service/gpu/tests/transpose_210.hlo | 1 + .../gpu/tests/transpose_210_extra_output.hlo | 1 + .../xla/service/gpu/tests/triton_naming.hlo | 1 + .../xla/xla/stream_executor/cuda/BUILD | 26 +- third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo | 1 + .../xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo | 1 + .../xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo | 1 + .../xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo | 1 + .../xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo | 1 + .../hlo_opt/gpu_hlo_unoptimized_llvm.hlo | 1 + third_party/xla/xla/tsl/BUILD | 24 +- third_party/xla/xla/tsl/cuda/BUILD.bazel | 31 +- third_party/xla/xla/tsl/tsl.bzl | 11 + 127 files changed, 5095 insertions(+), 766 deletions(-) create mode 100644 third_party/cuda_redist_json_repo.bzl create mode 100644 third_party/cuda_repo.bzl create mode 100644 third_party/gpus/compiler_common_tools.bzl create mode 100644 third_party/gpus/cuda/BUILD.hermetic.tpl create mode 100644 third_party/gpus/cuda/cuda_cccl.BUILD create mode 100644 third_party/gpus/cuda/cuda_cublas.BUILD.tpl 
create mode 100644 third_party/gpus/cuda/cuda_cudart.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_cudnn.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_cufft.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_cupti.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_curand.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_cusolver.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_cusparse.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_nccl.BUILD create mode 100644 third_party/gpus/cuda/cuda_nvcc.BUILD create mode 100644 third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl create mode 100644 third_party/gpus/cuda/cuda_nvml.BUILD create mode 100644 third_party/gpus/cuda/cuda_nvprune.BUILD create mode 100644 third_party/gpus/cuda/cuda_nvtx.BUILD create mode 100644 third_party/gpus/hermetic_cuda_configure.bzl create mode 100644 third_party/nccl/hermetic_nccl_configure.bzl create mode 100644 third_party/xla/third_party/cuda_redist_json_repo.bzl create mode 100644 third_party/xla/third_party/cuda_repo.bzl create mode 100644 third_party/xla/third_party/tsl/third_party/cuda_redist_json_repo.bzl create mode 100644 third_party/xla/third_party/tsl/third_party/cuda_repo.bzl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/compiler_common_tools.bzl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.hermetic.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cccl.BUILD create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cublas.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudart.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cufft.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cupti.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_curand.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nccl.BUILD create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvcc.BUILD create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvml.BUILD create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvprune.BUILD create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvtx.BUILD create mode 100644 third_party/xla/third_party/tsl/third_party/gpus/hermetic_cuda_configure.bzl create mode 100644 third_party/xla/third_party/tsl/third_party/nccl/hermetic_nccl_configure.bzl diff --git a/.bazelrc b/.bazelrc index 02dec0349c4741..c17ae4494dc99c 100644 --- a/.bazelrc +++ b/.bazelrc @@ -226,12 +226,13 @@ build:cuda --repo_env TF_NEED_CUDA=1 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --@local_config_cuda//:enable_cuda +build:no_cuda_libs --@local_config_cuda//cuda:include_hermetic_cuda_libs=false + # CUDA: This config refers to building CUDA op kernels with clang. 
build:cuda_clang --config=cuda -# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt -build:cuda_clang --config=tensorrt build:cuda_clang --action_env=TF_CUDA_CLANG="1" build:cuda_clang --@local_config_cuda//:cuda_compiler=clang +build:cuda_clang --copt=-Qunused-arguments # Select supported compute capabilities (supported graphics cards). # This is the same as the official TensorFlow builds. # See https://developer.nvidia.com/cuda-gpus#compute @@ -244,12 +245,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. build:cuda_clang_official --config=cuda_clang -build:cuda_clang_official --action_env=TF_CUDA_VERSION="12" -build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8" -build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3" +build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3" +build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8.9" build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang" -build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" # Build with nvcc for CUDA and clang for host @@ -545,10 +544,6 @@ build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda" -build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt" -build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl" -test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --config=nvcc_clang @@ -633,7 +628,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla # Test-related settings below this point. test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true test:release_linux_base --local_test_jobs=HOST_CPUS -test:release_linux_base --test_env=LD_LIBRARY_PATH # Give only the list of failed tests at the end of the log test:release_linux_base --test_summary=short @@ -645,7 +639,6 @@ build:release_gpu_linux --config=release_cpu_linux # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. # Note that linux cpu and cuda builds share the same toolchain now. 
build:release_gpu_linux --config=cuda_clang_official -test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" # Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute @@ -674,11 +667,8 @@ test:unsupported_cpu_linux --config=release_base build:unsupported_gpu_linux --config=cuda build:unsupported_gpu_linux --config=unsupported_cpu_linux build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11" -build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8" +build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8.6" build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" -build:unsupported_gpu_linux --config=tensorrt -build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2" -build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib" build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain diff --git a/ci/official/wheel.sh b/ci/official/wheel.sh index e2a828bfacce35..5ae0feca48f646 100755 --- a/ci/official/wheel.sh +++ b/ci/official/wheel.sh @@ -27,7 +27,8 @@ if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then export TFCI_BUILD_PIP_PACKAGE_ARGS="$(echo $TFCI_BUILD_PIP_PACKAGE_ARGS | sed 's/tensorflow/tf_nightly/')" fi -tfrun bazel build $TFCI_BAZEL_COMMON_ARGS //tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_ARGS +tfrun bazel build $TFCI_BAZEL_COMMON_ARGS --config=no_cuda_libs \ +//tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_ARGS tfrun find ./bazel-bin/tensorflow/tools/pip_package -iname "*.whl" -exec cp {} $TFCI_OUTPUT_DIR \; tfrun ./ci/official/utilities/rename_and_verify_wheels.sh diff --git a/configure.py b/configure.py index 0081eeabf66bcc..01e7f343872797 100644 --- a/configure.py +++ b/configure.py @@ -16,7 +16,6 @@ import argparse import errno -import glob import json import os import platform @@ -239,7 +238,7 @@ def setup_python(environ_cp): write_to_bazelrc('build --python_path=\"{}"'.format(python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = python_bin_path - # If choosen python_lib_path is from a path specified in the PYTHONPATH + # If chosen python_lib_path is from a path specified in the PYTHONPATH # variable, need to tell bazel to include PYTHONPATH if environ_cp.get('PYTHONPATH'): python_paths = environ_cp.get('PYTHONPATH').split(':') @@ -775,11 +774,6 @@ def get_ndk_api_level(environ_cp, android_ndk_home_path): def set_gcc_host_compiler_path(environ_cp): """Set GCC_HOST_COMPILER_PATH.""" default_gcc_host_compiler_path = which('gcc') or '' - cuda_bin_symlink = '%s/bin/gcc' % environ_cp.get('CUDA_TOOLKIT_PATH') - - if os.path.islink(cuda_bin_symlink): - # os.readlink is only available in linux - default_gcc_host_compiler_path = os.path.realpath(cuda_bin_symlink) gcc_host_compiler_path = prompt_loop_or_load_from_env( environ_cp, @@ -937,17 +931,6 @@ def disable_clang_offsetof_extension(clang_version): write_to_bazelrc('build --copt=-Wno-gnu-offsetof-extensions') -def set_tf_cuda_paths(environ_cp): - """Set TF_CUDA_PATHS.""" - ask_cuda_paths = ( - 'Please specify 
the comma-separated list of base paths to look for CUDA ' - 'libraries and headers. [Leave empty to use the default]: ') - tf_cuda_paths = get_from_env_or_user_or_default(environ_cp, 'TF_CUDA_PATHS', - ask_cuda_paths, '') - if tf_cuda_paths: - environ_cp['TF_CUDA_PATHS'] = tf_cuda_paths - - def set_tf_cuda_version(environ_cp): """Set TF_CUDA_VERSION.""" ask_cuda_version = ( @@ -972,73 +955,10 @@ def set_tf_cudnn_version(environ_cp): environ_cp['TF_CUDNN_VERSION'] = tf_cudnn_version -def set_tf_tensorrt_version(environ_cp): - """Set TF_TENSORRT_VERSION.""" - if not (is_linux() or is_windows()): - raise ValueError('Currently TensorRT is only supported on Linux platform.') - - if not int(environ_cp.get('TF_NEED_TENSORRT', False)): - return - - ask_tensorrt_version = ( - 'Please specify the TensorRT version you want to use. ' - '[Leave empty to default to TensorRT %s]: ') % _DEFAULT_TENSORRT_VERSION - tf_tensorrt_version = get_from_env_or_user_or_default( - environ_cp, 'TF_TENSORRT_VERSION', ask_tensorrt_version, - _DEFAULT_TENSORRT_VERSION) - environ_cp['TF_TENSORRT_VERSION'] = tf_tensorrt_version - - -def set_tf_nccl_version(environ_cp): - """Set TF_NCCL_VERSION.""" - if not is_linux(): - raise ValueError('Currently NCCL is only supported on Linux platform.') - - if 'TF_NCCL_VERSION' in environ_cp: - return - - ask_nccl_version = ( - 'Please specify the locally installed NCCL version you want to use. ' - '[Leave empty to use http://github.com/nvidia/nccl]: ') - tf_nccl_version = get_from_env_or_user_or_default(environ_cp, - 'TF_NCCL_VERSION', - ask_nccl_version, '') - environ_cp['TF_NCCL_VERSION'] = tf_nccl_version - - -def get_native_cuda_compute_capabilities(environ_cp): - """Get native cuda compute capabilities. - - Args: - environ_cp: copy of the os.environ. - - Returns: - string of native cuda compute capabilities, separated by comma. 
- """ - device_query_bin = os.path.join( - environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery') - if os.path.isfile(device_query_bin) and os.access(device_query_bin, os.X_OK): - try: - output = run_shell(device_query_bin).split('\n') - pattern = re.compile('[0-9]*\\.[0-9]*') - output = [pattern.search(x) for x in output if 'Capability' in x] - output = ','.join(x.group() for x in output if x is not None) - except subprocess.CalledProcessError: - output = '' - else: - output = '' - return output - - def set_tf_cuda_compute_capabilities(environ_cp): """Set TF_CUDA_COMPUTE_CAPABILITIES.""" while True: - native_cuda_compute_capabilities = get_native_cuda_compute_capabilities( - environ_cp) - if not native_cuda_compute_capabilities: - default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES - else: - default_cuda_compute_capabilities = native_cuda_compute_capabilities + default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES ask_cuda_compute_capabilities = ( 'Please specify a list of comma-separated CUDA compute capabilities ' @@ -1217,73 +1137,6 @@ def configure_ios(environ_cp): symlink_force(filepath, new_filepath) -def validate_cuda_config(environ_cp): - """Run find_cuda_config.py and return cuda_toolkit_path, or None.""" - - def maybe_encode_env(env): - """Encodes unicode in env to str on Windows python 2.x.""" - if not is_windows() or sys.version_info[0] != 2: - return env - for k, v in env.items(): - if isinstance(k, unicode): - k = k.encode('ascii') - if isinstance(v, unicode): - v = v.encode('ascii') - env[k] = v - return env - - cuda_libraries = ['cuda', 'cudnn'] - if is_linux(): - if int(environ_cp.get('TF_NEED_TENSORRT', False)): - cuda_libraries.append('tensorrt') - if environ_cp.get('TF_NCCL_VERSION', None): - cuda_libraries.append('nccl') - if is_windows(): - if int(environ_cp.get('TF_NEED_TENSORRT', False)): - cuda_libraries.append('tensorrt') - print('WARNING: TensorRT support on Windows is experimental\n') - - paths = glob.glob('**/third_party/gpus/find_cuda_config.py', recursive=True) - if not paths: - raise FileNotFoundError( - "Can't find 'find_cuda_config.py' script inside working directory") - proc = subprocess.Popen( - [environ_cp['PYTHON_BIN_PATH'], paths[0]] + cuda_libraries, - stdout=subprocess.PIPE, - env=maybe_encode_env(environ_cp)) - - if proc.wait(): - # Errors from find_cuda_config.py were sent to stderr. 
- print('Asking for detailed CUDA configuration...\n') - return False - - config = dict( - tuple(line.decode('ascii').rstrip().split(': ')) for line in proc.stdout) - - print('Found CUDA %s in:' % config['cuda_version']) - print(' %s' % config['cuda_library_dir']) - print(' %s' % config['cuda_include_dir']) - - print('Found cuDNN %s in:' % config['cudnn_version']) - print(' %s' % config['cudnn_library_dir']) - print(' %s' % config['cudnn_include_dir']) - - if 'tensorrt_version' in config: - print('Found TensorRT %s in:' % config['tensorrt_version']) - print(' %s' % config['tensorrt_library_dir']) - print(' %s' % config['tensorrt_include_dir']) - - if config.get('nccl_version', None): - print('Found NCCL %s in:' % config['nccl_version']) - print(' %s' % config['nccl_library_dir']) - print(' %s' % config['nccl_include_dir']) - - print('\n') - - environ_cp['CUDA_TOOLKIT_PATH'] = config['cuda_toolkit_path'] - return True - - def get_gcc_compiler(environ_cp): gcc_env = environ_cp.get('CXX') or environ_cp.get('CC') or which('gcc') if gcc_env is not None: @@ -1388,57 +1241,20 @@ def main(): if (environ_cp.get('TF_NEED_CUDA') == '1' and 'TF_CUDA_CONFIG_REPO' not in environ_cp): - set_action_env_var( - environ_cp, - 'TF_NEED_TENSORRT', - 'TensorRT', - False, - bazel_config_name='tensorrt') - - environ_save = dict(environ_cp) for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): - if validate_cuda_config(environ_cp): - cuda_env_names = [ - 'TF_CUDA_VERSION', - 'TF_CUBLAS_VERSION', - 'TF_CUDNN_VERSION', - 'TF_TENSORRT_VERSION', - 'TF_NCCL_VERSION', - 'TF_CUDA_PATHS', - # Items below are for backwards compatibility when not using - # TF_CUDA_PATHS. - 'CUDA_TOOLKIT_PATH', - 'CUDNN_INSTALL_PATH', - 'NCCL_INSTALL_PATH', - 'NCCL_HDR_PATH', - 'TENSORRT_INSTALL_PATH' - ] - # Note: set_action_env_var above already writes to bazelrc. - for name in cuda_env_names: - if name in environ_cp: - write_action_env_to_bazelrc(name, environ_cp[name]) - break - - # Restore settings changed below if CUDA config could not be validated. - environ_cp = dict(environ_save) - - set_tf_cuda_version(environ_cp) - set_tf_cudnn_version(environ_cp) - if is_windows(): - set_tf_tensorrt_version(environ_cp) - if is_linux(): - set_tf_tensorrt_version(environ_cp) - set_tf_nccl_version(environ_cp) - - set_tf_cuda_paths(environ_cp) + cuda_env_names = [ + 'TF_CUDA_VERSION', + 'TF_CUDNN_VERSION', + ] + # Note: set_action_env_var above already writes to bazelrc. + for name in cuda_env_names: + if name in environ_cp: + write_action_env_to_bazelrc(name, environ_cp[name]) + break - else: - raise UserInputError( - 'Invalid CUDA setting were provided %d ' - 'times in a row. Assuming to be a scripting mistake.' 
- % _DEFAULT_PROMPT_ASK_ATTEMPTS - ) + set_tf_cuda_version(environ_cp) + set_tf_cudnn_version(environ_cp) set_tf_cuda_compute_capabilities(environ_cp) if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get( diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir index 2d3c8e6f5b9ef7..67f5a036fccc32 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../../../../cuda_nvcc" // RUN: hlo_to_kernel --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70 func.func @tanh(%arg0: tensor<*xf32>) -> tensor<*xf32> attributes {tf_entry} { diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD index 84ec94ba673ff8..6de9910ca2233f 100644 --- a/tensorflow/core/common_runtime/gpu/BUILD +++ b/tensorflow/core/common_runtime/gpu/BUILD @@ -158,6 +158,7 @@ tf_cuda_library( "@local_config_cuda//cuda:cudnn_header", "@local_xla//xla/stream_executor/cuda:cuda_platform", "@local_xla//xla/stream_executor/gpu:gpu_stream", + "@local_xla//xla/tsl:gpu_runtime_hermetic_cuda_deps", ], defines = if_linux_x86_64(["TF_PLATFORM_LINUX_X86_64"]), features = ["-layering_check"], diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index b2645a331739e3..7c3947fa49e861 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -207,6 +207,8 @@ tf_staging/third_party/compute_library/BUILD: tf_staging/third_party/compute_library/build_defs.bzl: tf_staging/third_party/coremltools.BUILD: tf_staging/third_party/cub.BUILD: +tf_staging/third_party/cuda_redist_json_repo.bzl: +tf_staging/third_party/cuda_repo.bzl: tf_staging/third_party/curl.BUILD: tf_staging/third_party/cython.BUILD: tf_staging/third_party/ducc/BUILD: @@ -233,6 +235,7 @@ tf_staging/third_party/googleapis/build_rules.bzl: tf_staging/third_party/googleapis/googleapis.BUILD: tf_staging/third_party/googleapis/repository_rules.bzl: tf_staging/third_party/gpus/BUILD: +tf_staging/third_party/gpus/compiler_common_tools.bzl: tf_staging/third_party/gpus/crosstool/BUILD.rocm.tpl: tf_staging/third_party/gpus/crosstool/BUILD.sycl.tpl: tf_staging/third_party/gpus/crosstool/BUILD.tpl: @@ -243,15 +246,32 @@ tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tp tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_sycl.tpl: tf_staging/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl: tf_staging/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl: +tf_staging/third_party/gpus/cuda/BUILD.hermetic.tpl: tf_staging/third_party/gpus/cuda/BUILD.tpl: tf_staging/third_party/gpus/cuda/BUILD.windows.tpl: tf_staging/third_party/gpus/cuda/BUILD: tf_staging/third_party/gpus/cuda/LICENSE: tf_staging/third_party/gpus/cuda/build_defs.bzl.tpl: +tf_staging/third_party/gpus/cuda/cuda_cccl.BUILD: tf_staging/third_party/gpus/cuda/cuda_config.h.tpl: tf_staging/third_party/gpus/cuda/cuda_config.py.tpl: +tf_staging/third_party/gpus/cuda/cuda_cublas.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_cudart.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_cufft.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_cupti.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_curand.BUILD.tpl: 
+tf_staging/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_nccl.BUILD: +tf_staging/third_party/gpus/cuda/cuda_nvcc.BUILD: +tf_staging/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl: +tf_staging/third_party/gpus/cuda/cuda_nvml.BUILD: +tf_staging/third_party/gpus/cuda/cuda_nvprune.BUILD: +tf_staging/third_party/gpus/cuda/cuda_nvtx.BUILD: tf_staging/third_party/gpus/cuda_configure.bzl: tf_staging/third_party/gpus/find_cuda_config:.py +tf_staging/third_party/gpus/hermetic_cuda_configure.bzl: tf_staging/third_party/gpus/rocm/BUILD.tpl: tf_staging/third_party/gpus/rocm/BUILD: tf_staging/third_party/gpus/rocm/build_defs.bzl.tpl: @@ -283,6 +303,7 @@ tf_staging/third_party/nccl/archive.BUILD: tf_staging/third_party/nccl/archive.patch: tf_staging/third_party/nccl/build_defs.bzl.tpl: tf_staging/third_party/nccl/generated_names.bzl.tpl: +tf_staging/third_party/nccl/hermetic_nccl_configure.bzl: tf_staging/third_party/nccl/nccl_configure.bzl: tf_staging/third_party/nccl/system.BUILD.tpl: tf_staging/third_party/nlohmann_json.BUILD: diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD index bac9403d63dc27..eff65990c58c38 100644 --- a/tensorflow/python/framework/BUILD +++ b/tensorflow/python/framework/BUILD @@ -6,6 +6,7 @@ load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test") # Placeholder: load py_proto_library load( "//tensorflow:tensorflow.bzl", + "if_hermetic_cuda_tools", "if_not_windows", "if_oss", "if_xla_available", @@ -1045,6 +1046,13 @@ tf_python_pybind_extension( "python_api_dispatcher.h", "//tensorflow/python/lib/core:safe_pyobject_ptr_required_hdrs", ], + # This data is needed to add hermetic CUDA tools in python runfiles. + data = if_hermetic_cuda_tools( + [ + "@cuda_nvcc//:ptxas", + "@cuda_nvcc//:nvvm", + ], + ), enable_stub_generation = True, pytype_srcs = [ "_pywrap_python_api_dispatcher.pyi", diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 6e656b861bedaf..0bf6a91f995007 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -73,6 +73,7 @@ load( "tsl_gpu_library", _cc_header_only_library = "cc_header_only_library", _if_cuda_or_rocm = "if_cuda_or_rocm", + _if_hermetic_cuda_tools = "if_hermetic_cuda_tools", _if_nccl = "if_nccl", _transitive_hdrs = "transitive_hdrs", ) @@ -803,7 +804,7 @@ def tf_cc_shared_object( testonly = kwargs.pop("testonly", False) for name_os, name_os_major, name_os_full in names: - # Windows DLLs cant be versioned + # Windows DLLs can't be versioned if name_os.endswith(".dll"): name_os_major = name_os name_os_full = name_os @@ -3578,3 +3579,6 @@ def replace_with_portable_tf_lib_when_required(non_portable_tf_deps, use_lib_wit def tf_python_framework_friends(): return ["//tensorflow:__subpackages__"] + +def if_hermetic_cuda_tools(if_true, if_false = []): + return _if_hermetic_cuda_tools(if_true, if_false) diff --git a/tensorflow/tools/pip_package/build_pip_package.py b/tensorflow/tools/pip_package/build_pip_package.py index 10593503395532..6ad413acca561d 100644 --- a/tensorflow/tools/pip_package/build_pip_package.py +++ b/tensorflow/tools/pip_package/build_pip_package.py @@ -69,6 +69,34 @@ def prepare_headers(headers: list[str], srcs_dir: str) -> None: srcs_dir: target directory where headers are copied to. 
""" path_to_exclude = [ + "cuda_cccl/_virtual_includes", + "cuda_cublas/_virtual_includes", + "cuda_cudart/_virtual_includes", + "cuda_cudnn/_virtual_includes", + "cuda_cufft/_virtual_includes", + "cuda_cupti/_virtual_includes", + "cuda_curand/_virtual_includes", + "cuda_cusolver/_virtual_includes", + "cuda_cusparse/_virtual_includes", + "cuda_nccl/_virtual_includes", + "cuda_nvcc/_virtual_includes", + "cuda_nvjitlink/_virtual_includes", + "cuda_nvml/_virtual_includes", + "cuda_nvtx/_virtual_includes", + "external/cuda_cccl", + "external/cuda_cublas", + "external/cuda_cudart", + "external/cuda_cudnn", + "external/cuda_cufft", + "external/cuda_cupti", + "external/cuda_curand", + "external/cuda_cusolver", + "external/cuda_cusparse", + "external/cuda_nccl", + "external/cuda_nvcc", + "external/cuda_nvjitlink", + "external/cuda_nvml", + "external/cuda_nvtx", "external/pypi", "external/jsoncpp_git/src", "local_config_cuda/cuda/_virtual_includes", diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl index 41fe389aa09cad..48473d49e84ce2 100644 --- a/tensorflow/tools/toolchains/remote_config/configs.bzl +++ b/tensorflow/tools/toolchains/remote_config/configs.bzl @@ -703,7 +703,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -742,7 +742,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -782,7 +782,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) @@ -820,7 +820,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) diff --git a/tensorflow/tools/toolchains/remote_config/rbe_config.bzl b/tensorflow/tools/toolchains/remote_config/rbe_config.bzl index ae776c2a2fd388..317e7139e5f96b 100644 --- a/tensorflow/tools/toolchains/remote_config/rbe_config.bzl +++ b/tensorflow/tools/toolchains/remote_config/rbe_config.bzl @@ -1,9 +1,15 @@ """Macro that creates external repositories for remote config.""" load("//tensorflow/tools/toolchains/remote_config:containers.bzl", "containers") -load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") -load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure") @@ -42,7 +48,7 @@ def 
_tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "TF_CUDNN_VERSION": cudnn_version, "TF_CUDA_VERSION": cuda_version, "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "", "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu", "GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "", @@ -58,13 +64,17 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "Pool": "default", } - remote_cuda_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_cuda_configure. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_nccl_configure. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, @@ -175,13 +185,15 @@ def sigbuild_tf_configs(name_container_map, env): "Pool": "default", } - remote_cuda_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_cuda_configure for non-hermetic CUDA. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_nccl_configure for non-hermetic NCCL. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl index 77eea2ac869167..c041ad5d1abbbc 100644 --- a/tensorflow/workspace2.bzl +++ b/tensorflow/workspace2.bzl @@ -15,6 +15,7 @@ load("//tensorflow/tools/toolchains/clang6:repo.bzl", "clang6_configure") load("//tensorflow/tools/toolchains/embedded/arm-linux:arm_linux_toolchain_configure.bzl", "arm_linux_toolchain_configure") load("//tensorflow/tools/toolchains/remote:configure.bzl", "remote_execution_configure") load("//tensorflow/tools/toolchains/remote_config:configs.bzl", "initialize_rbe_configs") +load("//third_party:cuda_repo.bzl", "cuda_distributives") load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") load("//third_party/absl:workspace.bzl", absl = "repo") load("//third_party/benchmark:workspace.bzl", benchmark = "repo") @@ -29,7 +30,10 @@ load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo") load("//third_party/FP16:workspace.bzl", FP16 = "repo") load("//third_party/gemmlowp:workspace.bzl", gemmlowp = "repo") load("//third_party/git:git_configure.bzl", "git_configure") -load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "rocm_configure") load("//third_party/hexagon:workspace.bzl", hexagon_nn = "repo") load("//third_party/highwayhash:workspace.bzl", highwayhash = "repo") @@ -41,7 +45,10 @@ load("//third_party/kissfft:workspace.bzl", kissfft = "repo") load("//third_party/libprotobuf_mutator:workspace.bzl", libprotobuf_mutator = "repo") load("//third_party/llvm:setup.bzl", "llvm_setup") load("//third_party/nasm:workspace.bzl", nasm = "repo") 
-load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/opencl_headers:workspace.bzl", opencl_headers = "repo") load("//third_party/pasta:workspace.bzl", pasta = "repo") load("//third_party/py:python_configure.bzl", "python_configure") @@ -103,9 +110,15 @@ def _tf_toolchains(): # Note that we check the minimum bazel version in WORKSPACE. clang6_configure(name = "local_config_clang6") cc_download_clang_toolchain(name = "local_config_download_clang") - cuda_configure(name = "local_config_cuda") + + # If you need to use non-hermetic CUDA, replace the line below with + # cuda_configure(name = "local_config_cuda") + hermetic_cuda_configure(name = "local_config_cuda") tensorrt_configure(name = "local_config_tensorrt") - nccl_configure(name = "local_config_nccl") + + # If you need to use non-hermetic CUDA, replace the line below with + # nccl_configure(name = "local_config_nccl") + hermetic_nccl_configure(name = "local_config_nccl") git_configure(name = "local_config_git") syslibs_configure(name = "local_config_syslibs") python_configure(name = "local_config_python") @@ -919,6 +932,28 @@ def _tf_repositories(): version_conflict_policy = "pinned", ) +_CUDA_12_3_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + "sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + +_CUDA_12_1_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + "sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + def workspace(): # Check the bazel version before executing any repository rules, in case # those rules rely on the version we require here. @@ -936,6 +971,10 @@ def workspace(): # don't already exist (at least if the external repository macros were # written according to common practice to query native.existing_rule()). _tf_repositories() + cuda_distributives(cuda_nccl_wheel_dict = { + "12.3.2": _CUDA_12_3_NCCL_WHEEL_DICT, + "12.1.1": _CUDA_12_1_NCCL_WHEEL_DICT, + }) tfrt_dependencies() diff --git a/tensorflow/workspace3.bzl b/tensorflow/workspace3.bzl index d7b32f01c7144d..3632d49ade844f 100644 --- a/tensorflow/workspace3.bzl +++ b/tensorflow/workspace3.bzl @@ -1,10 +1,33 @@ """TensorFlow workspace initialization. 
Consult the WORKSPACE on how to use it.""" load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:cuda_redist_json_repo.bzl", "cuda_redist_json") load("//third_party:repo.bzl", "tf_vendored") load("//third_party/llvm:workspace.bzl", llvm = "repo") load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo") +_CUDA_REDIST_JSON_DICT = { + "12.1.1": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.1.1.json", + "bafea3cb83a4cf5c764eeedcaac0040d0d3c5db3f9a74550da0e7b6ac24d378c", + ], + "12.3.2": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.3.2.json", + "1b6eacf335dd49803633fed53ef261d62c193e5a56eee5019e7d2f634e39e7ef", + ], +} + +_CUDNN_REDIST_JSON_DICT = { + "8.6": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.6.0.json", + "7f6f50bed4fd8216dc10d6ef505771dc0ecc99cce813993ab405cb507a21d51d", + ], + "8.9.7.29": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.9.7.29.json", + "a0734f26f068522464fa09b2f2c186dfbe6ad7407a88ea0c50dd331f0c3389ec", + ], +} + def workspace(): tf_vendored(name = "local_xla", relpath = "third_party/xla") tf_vendored(name = "local_tsl", relpath = "third_party/xla/third_party/tsl") @@ -62,6 +85,13 @@ def workspace(): # but provides a script for setting up build rules via overlays. llvm("llvm-raw") + # Load JSON files for CUDA and cuDNN distribution versions. + cuda_redist_json( + name = "cuda_redist_json", + cuda_json_dict = _CUDA_REDIST_JSON_DICT, + cudnn_json_dict = _CUDNN_REDIST_JSON_DICT, + ) + # Alias so it can be loaded without assigning to a different symbol to prevent # shadowing previous loads and trigger a buildifier warning. tf_workspace3 = workspace diff --git a/third_party/cuda_redist_json_repo.bzl b/third_party/cuda_redist_json_repo.bzl new file mode 100644 index 00000000000000..76941dd74c9488 --- /dev/null +++ b/third_party/cuda_redist_json_repo.bzl @@ -0,0 +1,110 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining CUDA and cuDNN JSON files with distributives versions.""" + +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_DEFAULT_CUDNN_VERSION = "8.9.7.29" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _cuda_redist_json_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + supported_cuda_versions = repository_ctx.attr.cuda_json_dict.keys() + if cuda_version and (cuda_version not in supported_cuda_versions): + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + else: + fail( + ("The supported CUDA versions are {supported_versions}." 
+ + " Please provide a supported version in TF_CUDA_VERSION" + + " environment variable or add JSON URL for" + + " CUDA version={version}.") + .format( + supported_versions = supported_cuda_versions, + version = cuda_version, + ), + ) + supported_cudnn_versions = repository_ctx.attr.cudnn_json_dict.keys() + if cudnn_version and (cudnn_version not in supported_cudnn_versions): + if cudnn_version in ["8", "8.9"]: + cudnn_version = _DEFAULT_CUDNN_VERSION + else: + fail( + ("The supported CUDNN versions are {supported_versions}." + + " Please provide a supported version in TF_CUDNN_VERSION" + + " environment variable or add JSON URL for" + + " CUDNN version={version}.") + .format( + supported_versions = supported_cudnn_versions, + version = cudnn_version, + ), + ) + cuda_distributives = "{}" + cudnn_distributives = "{}" + if cuda_version: + (url, sha256) = repository_ctx.attr.cuda_json_dict[cuda_version] + json_file_name = "redistrib_cuda_%s.json" % cuda_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cuda_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + if cudnn_version: + (url, sha256) = repository_ctx.attr.cudnn_json_dict[cudnn_version] + json_file_name = "redistrib_cudnn_%s.json" % cudnn_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cudnn_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + + repository_ctx.file( + "build_defs.bzl", + """def get_cuda_distributives(): + return {cuda_distributives} + +def get_cudnn_distributives(): + return {cudnn_distributives} +""".format(cuda_distributives = cuda_distributives, cudnn_distributives = cudnn_distributives), + ) + repository_ctx.file( + "BUILD", + "", + ) + +_cuda_redist_json = repository_rule( + implementation = _cuda_redist_json_impl, + attrs = { + "cuda_json_dict": attr.string_list_dict(mandatory = True), + "cudnn_json_dict": attr.string_list_dict(mandatory = True), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_redist_json(name, cuda_json_dict, cudnn_json_dict): + _cuda_redist_json( + name = name, + cuda_json_dict = cuda_json_dict, + cudnn_json_dict = cudnn_json_dict, + ) diff --git a/third_party/cuda_repo.bzl b/third_party/cuda_repo.bzl new file mode 100644 index 00000000000000..0dcda26cba3a23 --- /dev/null +++ b/third_party/cuda_repo.bzl @@ -0,0 +1,327 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for defining Google ML CUDA dependencies.""" + +load("@cuda_redist_json//:build_defs.bzl", "get_cuda_distributives", "get_cudnn_distributives") +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_OS_ARCH_DICT = { + "amd64": "x86_64-unknown-linux-gnu", + "aarch64": "aarch64-unknown-linux-gnu", +} +_REDIST_ARCH_DICT = { + "linux-x86_64": "x86_64-unknown-linux-gnu", + "linux-sbsa": "aarch64-unknown-linux-gnu", +} + +_CUDA_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cuda/redist/" +_CUDNN_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cudnn/redist/" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _get_archive_name(url, archive_suffix = ".tar.xz"): + last_slash_index = url.rfind("/") + return url[last_slash_index + 1:-len(archive_suffix)] + +def _cuda_http_archive_impl(repository_ctx): + cuda_or_cudnn_version = None + dist_version = "" + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + if repository_ctx.attr.is_cudnn_dist: + cuda_or_cudnn_version = cudnn_version + else: + cuda_or_cudnn_version = cuda_version + if cuda_or_cudnn_version: + # Download archive only when GPU config is used. + dist_version = repository_ctx.attr.dist_version + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + if arch not in repository_ctx.attr.relative_url_dict.keys(): + (relative_url, sha256) = repository_ctx.attr.relative_url_dict["cuda{version}_{arch}" \ + .format( + version = cuda_version.split(".")[0], + arch = arch, + )] + else: + (relative_url, sha256) = repository_ctx.attr.relative_url_dict[arch] + url = (repository_ctx.attr.cudnn_dist_path_prefix if repository_ctx.attr.is_cudnn_dist else repository_ctx.attr.cuda_dist_path_prefix) + relative_url + + archive_name = _get_archive_name(url) + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.override_strip_prefix if repository_ctx.attr.override_strip_prefix else archive_name, + ) + if repository_ctx.attr.build_template: + version = dist_version.split(".")[0] if dist_version else "" + repository_ctx.file("version.txt", version) + repository_ctx.template( + "BUILD", + repository_ctx.attr.build_template, + {"%{version}": version}, + ) + else: + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_http_archive = repository_rule( + implementation = _cuda_http_archive_impl, + attrs = { + "dist_version": attr.string(mandatory = True), + "relative_url_dict": attr.string_list_dict(mandatory = True), + "build_template": attr.label(), + "build_file": attr.label(), + "is_cudnn_dist": attr.bool(), + "override_strip_prefix": attr.string(), + "cudnn_dist_path_prefix": attr.string(default = _CUDNN_DIST_PATH_PREFIX), + "cuda_dist_path_prefix": attr.string(default = _CUDA_DIST_PATH_PREFIX), + "extension": attr.string(default = ".tar.xz"), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_http_archive(name, dist_version, relative_url_dict, **kwargs): + _cuda_http_archive( + name = name, + dist_version = dist_version, + relative_url_dict = relative_url_dict, + **kwargs + ) + +def _cuda_wheel_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, 
"TF_CUDA_VERSION") + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + if cuda_version: + # Download archive only when GPU config is used. + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + dict_key = "{cuda_version}-{arch}".format( + cuda_version = cuda_version, + arch = arch, + ) + supported_versions = repository_ctx.attr.url_dict.keys() + if dict_key not in supported_versions: + fail( + ("The supported NCCL versions are {supported_versions}." + + " Please provide a supported CUDA version in TF_CUDA_VERSION" + + " environment variable or add NCCL distributive for" + + " CUDA version={version}, OS={arch}.") + .format( + supported_versions = supported_versions, + version = cuda_version, + arch = arch, + ), + ) + sha256 = repository_ctx.attr.sha256_dict[dict_key] + url = repository_ctx.attr.url_dict[dict_key] + + archive_name = _get_archive_name(url, archive_suffix = ".whl") + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.strip_prefix, + ) + + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_wheel = repository_rule( + implementation = _cuda_wheel_impl, + attrs = { + "sha256_dict": attr.string_dict(mandatory = True), + "url_dict": attr.string_dict(mandatory = True), + "build_file": attr.label(), + "strip_prefix": attr.string(), + "extension": attr.string(default = ".zip"), + }, + environ = ["TF_CUDA_VERSION"], +) + +def cuda_wheel(name, sha256_dict, url_dict, **kwargs): + _cuda_wheel( + name = name, + sha256_dict = sha256_dict, + url_dict = url_dict, + **kwargs + ) + +def _get_relative_url_dict(dist_info): + relative_url_dict = {} + for arch in _REDIST_ARCH_DICT.keys(): + # CUDNN JSON might contain paths for each CUDA version. 
+ if "relative_path" not in dist_info[arch]: + for cuda_version, data in dist_info[arch].items(): + relative_url_dict["{cuda_version}_{arch}" \ + .format( + cuda_version = cuda_version, + arch = _REDIST_ARCH_DICT[arch], + )] = [data["relative_path"], data["sha256"]] + else: + relative_url_dict[_REDIST_ARCH_DICT[arch]] = [ + dist_info[arch]["relative_path"], + dist_info[arch]["sha256"], + ] + return relative_url_dict + +def _get_cuda_archive( + repo_name, + dist_dict, + dist_name, + build_file = None, + build_template = None, + is_cudnn_dist = False): + if dist_name in dist_dict.keys(): + return cuda_http_archive( + name = repo_name, + dist_version = dist_dict[dist_name]["version"], + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = _get_relative_url_dict(dist_dict[dist_name]), + is_cudnn_dist = is_cudnn_dist, + ) + else: + return cuda_http_archive( + name = repo_name, + dist_version = "", + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = {"": []}, + is_cudnn_dist = is_cudnn_dist, + ) + +def cuda_distributives(cuda_nccl_wheel_dict): + nccl_artifacts_dict = {"sha256_dict": {}, "url_dict": {}} + for cuda_version, nccl_wheel_info in cuda_nccl_wheel_dict.items(): + for arch in _OS_ARCH_DICT.values(): + if arch in nccl_wheel_info.keys(): + cuda_version_to_arch_key = "%s-%s" % (cuda_version, arch) + nccl_artifacts_dict["sha256_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["sha256"] + nccl_artifacts_dict["url_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["url"] + + cuda_wheel( + name = "cuda_nccl", + sha256_dict = nccl_artifacts_dict["sha256_dict"], + url_dict = nccl_artifacts_dict["url_dict"], + build_file = Label("//third_party/gpus/cuda:cuda_nccl.BUILD"), + strip_prefix = "nvidia/nccl", + ) + + cuda_distributives = get_cuda_distributives() + cudnn_distributives = get_cudnn_distributives() + + _get_cuda_archive( + repo_name = "cuda_cccl", + dist_dict = cuda_distributives, + dist_name = "cuda_cccl", + build_file = "//third_party/gpus/cuda:cuda_cccl.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_cublas", + dist_dict = cuda_distributives, + dist_name = "libcublas", + build_template = "//third_party/gpus/cuda:cuda_cublas.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudart", + dist_dict = cuda_distributives, + dist_name = "cuda_cudart", + build_template = "//third_party/gpus/cuda:cuda_cudart.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudnn", + dist_dict = cudnn_distributives, + dist_name = "cudnn", + build_template = "//third_party/gpus/cuda:cuda_cudnn.BUILD.tpl", + is_cudnn_dist = True, + ) + _get_cuda_archive( + repo_name = "cuda_cufft", + dist_dict = cuda_distributives, + dist_name = "libcufft", + build_template = "//third_party/gpus/cuda:cuda_cufft.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cupti", + dist_dict = cuda_distributives, + dist_name = "cuda_cupti", + build_template = "//third_party/gpus/cuda:cuda_cupti.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_curand", + dist_dict = cuda_distributives, + dist_name = "libcurand", + build_template = "//third_party/gpus/cuda:cuda_curand.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusolver", + dist_dict = cuda_distributives, + dist_name = "libcusolver", + build_template = "//third_party/gpus/cuda:cuda_cusolver.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = 
"cuda_cusparse", + dist_dict = cuda_distributives, + dist_name = "libcusparse", + build_template = "//third_party/gpus/cuda:cuda_cusparse.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvcc", + dist_dict = cuda_distributives, + dist_name = "cuda_nvcc", + build_file = "//third_party/gpus/cuda:cuda_nvcc.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvjitlink", + dist_dict = cuda_distributives, + dist_name = "libnvjitlink", + build_template = "//third_party/gpus/cuda:cuda_nvjitlink.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvml", + dist_dict = cuda_distributives, + dist_name = "cuda_nvml_dev", + build_file = "//third_party/gpus/cuda:cuda_nvml.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvprune", + dist_dict = cuda_distributives, + dist_name = "cuda_nvprune", + build_file = "//third_party/gpus/cuda:cuda_nvprune.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvtx", + dist_dict = cuda_distributives, + dist_name = "cuda_nvtx", + build_file = "//third_party/gpus/cuda:cuda_nvtx.BUILD", + ) diff --git a/third_party/gpus/compiler_common_tools.bzl b/third_party/gpus/compiler_common_tools.bzl new file mode 100644 index 00000000000000..bd07f49ec457bb --- /dev/null +++ b/third_party/gpus/compiler_common_tools.bzl @@ -0,0 +1,174 @@ +"""Common compiler functions. """ + +load( + "//third_party/remote_config:common.bzl", + "err_out", + "raw_exec", + "realpath", +) + +def to_list_of_strings(elements): + """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. + + This is to be used to put a list of strings into the bzl file templates + so it gets interpreted as list of strings in Starlark. + + Args: + elements: list of string elements + + Returns: + single string of elements wrapped in quotes separated by a comma.""" + quoted_strings = ["\"" + element + "\"" for element in elements] + return ", ".join(quoted_strings) + +_INC_DIR_MARKER_BEGIN = "#include <...>" + +# OSX add " (framework directory)" at the end of line, strip it. +_OSX_FRAMEWORK_SUFFIX = " (framework directory)" +_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) + +# TODO(dzc): Once these functions have been factored out of Bazel's +# cc_configure.bzl, load them from @bazel_tools instead. +def _cxx_inc_convert(path): + """Convert path returned by cc -E xc++ in a complete path.""" + path = path.strip() + if path.endswith(_OSX_FRAMEWORK_SUFFIX): + path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() + return path + +def _normalize_include_path(repository_ctx, path): + """Normalizes include paths before writing them to the crosstool. + + If path points inside the 'crosstool' folder of the repository, a relative + path is returned. + If path points outside the 'crosstool' folder, an absolute path is returned. + """ + path = str(repository_ctx.path(path)) + crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) + + if path.startswith(crosstool_folder): + # We drop the path to "$REPO/crosstool" and a trailing path separator. + return path[len(crosstool_folder) + 1:] + return path + +def _is_compiler_option_supported(repository_ctx, cc, option): + """Checks that `option` is supported by the C compiler. 
Doesn't %-escape the option.""" + result = repository_ctx.execute([ + cc, + option, + "-o", + "/dev/null", + "-c", + str(repository_ctx.path("tools/cpp/empty.cc")), + ]) + return result.stderr.find(option) == -1 + +def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sys_root): + """Compute the list of default C or C++ include directories.""" + if lang_is_cpp: + lang = "c++" + else: + lang = "c" + sysroot = [] + if tf_sys_root: + sysroot += ["--sysroot", tf_sys_root] + result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + + sysroot) + stderr = err_out(result) + index1 = stderr.find(_INC_DIR_MARKER_BEGIN) + if index1 == -1: + return [] + index1 = stderr.find("\n", index1) + if index1 == -1: + return [] + index2 = stderr.rfind("\n ") + if index2 == -1 or index2 < index1: + return [] + index2 = stderr.find("\n", index2 + 1) + if index2 == -1: + inc_dirs = stderr[index1 + 1:] + else: + inc_dirs = stderr[index1 + 1:index2].strip() + + print_resource_dir_supported = _is_compiler_option_supported( + repository_ctx, + cc, + "-print-resource-dir", + ) + + if print_resource_dir_supported: + resource_dir = repository_ctx.execute( + [cc, "-print-resource-dir"], + ).stdout.strip() + "/share" + inc_dirs += "\n" + resource_dir + + compiler_includes = [ + _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) + for p in inc_dirs.split("\n") + ] + + # The compiler might be on a symlink, e.g. /symlink -> /opt/gcc + # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) + # but Bazel might encounter either (usually reported by the compiler) + # especially when a compiler wrapper (e.g. ccache) is used. + # So we need to also include paths where symlinks are not resolved. + + # Try to find real path to CC installation to "see through" compiler wrappers + # GCC has the path to g++ + index1 = result.stderr.find("COLLECT_GCC=") + if index1 != -1: + index1 = result.stderr.find("=", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname + else: + # Clang has the directory + index1 = result.stderr.find("InstalledDir: ") + if index1 != -1: + index1 = result.stderr.find(" ", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname + else: + # Fallback to the CC path + cc_topdir = repository_ctx.path(cc).dirname.dirname + + # We now have the compiler installation prefix, e.g. /symlink/gcc + # And the resolved installation prefix, e.g. /opt/gcc + cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() + cc_topdir = str(cc_topdir).strip() + + # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. + # e.g. [/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] + # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] + if cc_topdir_resolved != cc_topdir: + unresolved_compiler_includes = [ + cc_topdir + inc[len(cc_topdir_resolved):] + for inc in compiler_includes + if inc.startswith(cc_topdir_resolved) + ] + compiler_includes = compiler_includes + unresolved_compiler_includes + return compiler_includes + +def get_cxx_inc_directories(repository_ctx, cc, tf_sys_root): + """Compute the list of default C and C++ include directories.""" + + # For some reason `clang -xc` sometimes returns include paths that are + # different from the ones from `clang -xc++`. 
(Symlink and a dir) + # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists + includes_cpp = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + True, + tf_sys_root, + ) + includes_c = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + False, + tf_sys_root, + ) + + return includes_cpp + [ + inc + for inc in includes_c + if inc not in includes_cpp + ] diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl index 8eda7a1cf6ac2b..b9553d9b99ecfe 100644 --- a/third_party/gpus/crosstool/BUILD.tpl +++ b/third_party/gpus/crosstool/BUILD.tpl @@ -2,6 +2,7 @@ # Update cuda_configure.bzl#verify_build_defines when adding new variables. load(":cc_toolchain_config.bzl", "cc_toolchain_config") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") licenses(["restricted"]) @@ -133,9 +134,17 @@ filegroup( srcs = [], ) +filegroup( + name = "cuda_nvcc_files", + srcs = %{cuda_nvcc_files}, +) + filegroup( name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], + srcs = [ + ":cuda_nvcc_files", + ":clang/bin/crosstool_wrapper_driver_is_not_gcc" + ], ) filegroup( diff --git a/third_party/gpus/cuda/BUILD.hermetic.tpl b/third_party/gpus/cuda/BUILD.hermetic.tpl new file mode 100644 index 00000000000000..1c00f1c5e32916 --- /dev/null +++ b/third_party/gpus/cuda/BUILD.hermetic.tpl @@ -0,0 +1,291 @@ +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +# Config setting whether TensorFlow is built with CUDA support using clang. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_clang. +selects.config_setting_group( + name = "using_clang", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_clang", + ], +) + +# Config setting whether TensorFlow is built with CUDA support using nvcc. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. +selects.config_setting_group( + name = "using_nvcc", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_nvcc", + ], +) + +# Equivalent to using_clang && -c opt. +selects.config_setting_group( + name = "using_clang_opt", + match_all = [ + ":using_clang", + ":_opt", + ], +) + +config_setting( + name = "_opt", + values = {"compilation_mode": "opt"}, +) + +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. 
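+# Usage sketch (the cc_library below is hypothetical and not part of this +# template; it only illustrates how a client would depend on the headers): +# +#   cc_library( +#       name = "my_gpu_kernel", +#       srcs = ["my_gpu_kernel.cc"], +#       deps = ["@local_config_cuda//cuda:cuda_headers"], +#   )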
+cc_library( + name = "cuda_headers", + hdrs = [ + "cuda/cuda_config.h", + ], + include_prefix = "third_party/gpus", + includes = [ + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h + ], + deps = [":cudart_headers", + ":cublas_headers", + ":cccl_headers", + ":nvtx_headers", + ":nvcc_headers", + ":nvjitlink_headers", + ":cusolver_headers", + ":cufft_headers", + ":cusparse_headers", + ":curand_headers", + ":cupti_headers", + ":nvml_headers"], +) + +cc_library( + name = "cudart_static", + srcs = ["@cuda_cudart//:static"], + linkopts = [ + "-ldl", + "-lpthread", + %{cudart_static_linkopt} + ], +) + +alias( + name = "cuda_driver", + actual = "@cuda_cudart//:cuda_driver", +) + +alias( + name = "cudart_headers", + actual = "@cuda_cudart//:headers", +) + +alias( + name = "cudart", + actual = "@cuda_cudart//:cudart", +) + +alias( + name = "nvjitlink_headers", + actual = "@cuda_nvjitlink//:headers", +) + +alias( + name = "nvjitlink", + actual = "@cuda_nvjitlink//:nvjitlink", +) + +alias( + name = "nvtx_headers", + actual = "@cuda_nvtx//:headers", +) + +alias( + name = "nvml_headers", + actual = "@cuda_nvml//:headers", +) + +alias( + name = "nvcc_headers", + actual = "@cuda_nvcc//:headers", +) + +alias( + name = "cccl_headers", + actual = "@cuda_cccl//:headers", +) + +alias( + name = "cublas_headers", + actual = "@cuda_cublas//:headers", +) + +alias( + name = "cusolver_headers", + actual = "@cuda_cusolver//:headers", +) + +alias( + name = "cufft_headers", + actual = "@cuda_cufft//:headers", +) + +alias( + name = "cusparse_headers", + actual = "@cuda_cusparse//:headers", +) + +alias( + name = "curand_headers", + actual = "@cuda_curand//:headers", +) + +alias( + name = "cublas", + actual = "@cuda_cublas//:cublas", +) + +alias( + name = "cublasLt", + actual = "@cuda_cublas//:cublasLt", +) + +alias( + name = "cusolver", + actual = "@cuda_cusolver//:cusolver", +) + +alias( + name = "cudnn", + actual = "@cuda_cudnn//:cudnn", +) + +alias( + name = "cudnn_ops_infer", + actual = "@cuda_cudnn//:cudnn_ops_infer", +) + +alias( + name = "cudnn_cnn_infer", + actual = "@cuda_cudnn//:cudnn_cnn_infer", +) + +alias( + name = "cudnn_ops_train", + actual = "@cuda_cudnn//:cudnn_ops_train", +) + +alias( + name = "cudnn_cnn_train", + actual = "@cuda_cudnn//:cudnn_cnn_train", +) + +alias( + name = "cudnn_adv_infer", + actual = "@cuda_cudnn//:cudnn_adv_infer", +) + +alias( + name = "cudnn_adv_train", + actual = "@cuda_cudnn//:cudnn_adv_train", +) +alias( + name = "cudnn_header", + actual = "@cuda_cudnn//:headers", +) + +alias( + name = "cufft", + actual = "@cuda_cufft//:cufft", +) + +alias( + name = "curand", + actual = "@cuda_curand//:curand", +) + +cc_library( + name = "cuda", + deps = [ + ":cublas", + ":cublasLt", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +alias( + name = "cub_headers", + actual = "%{cub_actual}", +) + +alias( + name = "cupti_headers", + actual = "@cuda_cupti//:headers", +) + +alias( + name = "cupti_dsos", + actual = "@cuda_cupti//:cupti", +) + +alias( + name = "cusparse", + actual = "@cuda_cusparse//:cusparse", +) + +cc_library( + name = "libdevice_root", + data = ["@cuda_nvcc//:nvvm"], +) + +bzl_library( + name = "build_defs_bzl", + srcs = ["build_defs.bzl"], + deps = [ + "@bazel_skylib//lib:selects", + ], +) + +py_library( + name = "cuda_config_py", + srcs = ["cuda/cuda_config.py"], +) + +# Config setting whether TensorFlow is built with hermetic CUDA. 
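+# The alias below, combined with the ":include_hermetic_cuda_libs" flag that +# follows it, feeds ":hermetic_cuda_tools_and_libs", which matches only when +# CUDA is enabled and the flag is set. The flag can be flipped per build with +# the same syntax the .bazelrc in this patch uses ("//pkg:target" is a +# placeholder): +# +#   bazel build --@local_config_cuda//cuda:include_hermetic_cuda_libs=false //pkg:target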
+alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = True, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 90a18b90de048c..a4264cc14890e5 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -1,6 +1,7 @@ load(":build_defs.bzl", "cuda_header_library") load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like @@ -144,7 +145,6 @@ cc_library( name = "cusolver", srcs = ["cuda/lib/%{cusolver_lib}"], data = ["cuda/lib/%{cusolver_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -220,7 +220,6 @@ cc_library( name = "cusparse", srcs = ["cuda/lib/%{cusparse_lib}"], data = ["cuda/lib/%{cusparse_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -242,4 +241,29 @@ py_library( srcs = ["cuda/cuda_config.py"], ) +# Config setting whether TensorFlow is built with hermetic CUDA. +alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = False, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + %{copy_rules} diff --git a/third_party/gpus/cuda/cuda_cccl.BUILD b/third_party/gpus/cuda/cuda_cccl.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cccl.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cublas.BUILD.tpl b/third_party/gpus/cuda/cuda_cublas.BUILD.tpl new file mode 100644 index 00000000000000..d5766c971a50ff --- /dev/null +++ b/third_party/gpus/cuda/cuda_cublas.BUILD.tpl @@ -0,0 +1,33 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cublas", + hdrs = [":headers"], + shared_library = "lib/libcublas.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cublasLt", + hdrs = [":headers"], + shared_library = "lib/libcublasLt.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cublas.h", + "include/cublas_v2.h", + "include/cublas_api.h", + "include/cublasLt.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cudart.BUILD.tpl 
b/third_party/gpus/cuda/cuda_cudart.BUILD.tpl new file mode 100644 index 00000000000000..08655e7819156c --- /dev/null +++ b/third_party/gpus/cuda/cuda_cudart.BUILD.tpl @@ -0,0 +1,34 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +filegroup( + name = "static", + srcs = ["lib/libcudart_static.a"], + visibility = ["@local_config_cuda//cuda:__pkg__"], +) + +cc_import( + name = "cuda_driver", + shared_library = "lib/stubs/libcuda.so", +) + +cc_import( + name = "cudart", + hdrs = [":headers"], + shared_library = "lib/libcudart.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl b/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl new file mode 100644 index 00000000000000..98da6e69cbe644 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl @@ -0,0 +1,65 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cudnn", + hdrs = [":headers"], + shared_library = "lib/libcudnn.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cudnn", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cufft.BUILD.tpl b/third_party/gpus/cuda/cuda_cufft.BUILD.tpl new file mode 100644 index 00000000000000..6836814dc9b622 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cufft.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cufft", + hdrs = [":headers"], + shared_library = "lib/libcufft.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cupti.BUILD.tpl b/third_party/gpus/cuda/cuda_cupti.BUILD.tpl new file mode 100644 index 00000000000000..772386d723649f --- /dev/null +++ b/third_party/gpus/cuda/cuda_cupti.BUILD.tpl @@ -0,0 
+1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cupti", + hdrs = [":headers"], + shared_library = "lib/libcupti.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/extras/CUPTI/include", + includes = ["include/"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_curand.BUILD.tpl b/third_party/gpus/cuda/cuda_curand.BUILD.tpl new file mode 100644 index 00000000000000..c98ded26f4b907 --- /dev/null +++ b/third_party/gpus/cuda/cuda_curand.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "curand", + hdrs = [":headers"], + shared_library = "lib/libcurand.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl b/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl new file mode 100644 index 00000000000000..6a5f9d9737cfe2 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl @@ -0,0 +1,25 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusolver", + hdrs = [":headers"], + shared_library = "lib/libcusolver.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cusolver_common.h", + "include/cusolverDn.h", + "include/cusolverSp.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl b/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl new file mode 100644 index 00000000000000..ad5c2b5f0c45c1 --- /dev/null +++ b/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusparse", + hdrs = [":headers"], + shared_library = "lib/libcusparse.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nccl.BUILD b/third_party/gpus/cuda/cuda_nccl.BUILD new file mode 100644 index 00000000000000..440b31c5cb616e --- /dev/null +++ b/third_party/gpus/cuda/cuda_nccl.BUILD @@ -0,0 +1,7 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_import( + name = "nccl", + shared_library = "lib/libnccl.so.2", + visibility = ["//visibility:public"], +) diff --git a/third_party/gpus/cuda/cuda_nvcc.BUILD b/third_party/gpus/cuda/cuda_nvcc.BUILD new file mode 100644 index 00000000000000..6cdaca5cc902a0 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvcc.BUILD @@ -0,0 +1,73 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "bin/nvcc", +]) + +filegroup( + name = "nvvm", + srcs = [ + 
"nvvm/libdevice/libdevice.10.bc", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "nvlink", + srcs = [ + "bin/nvlink", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "fatbinary", + srcs = [ + "bin/fatbinary", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin2c", + srcs = [ + "bin/bin2c", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "ptxas", + srcs = [ + "bin/ptxas", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin", + srcs = glob([ + "bin/**", + "nvvm/bin/**", + ]), + visibility = ["//visibility:public"], +) + +filegroup( + name = "link_stub", + srcs = [ + "bin/crt/link.stub", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl b/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl new file mode 100644 index 00000000000000..6729b7cd1df9c4 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "nvjitlink", + hdrs = [":headers"], + shared_library = "lib/libnvJitLink.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nvml.BUILD b/third_party/gpus/cuda/cuda_nvml.BUILD new file mode 100644 index 00000000000000..40b97e671cf7de --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvml.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/nvml/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda/cuda_nvprune.BUILD b/third_party/gpus/cuda/cuda_nvprune.BUILD new file mode 100644 index 00000000000000..986ef0c8f76166 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvprune.BUILD @@ -0,0 +1,9 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +filegroup( + name = "nvprune", + srcs = [ + "bin/nvprune", + ], + visibility = ["//visibility:public"], +) diff --git a/third_party/gpus/cuda/cuda_nvtx.BUILD b/third_party/gpus/cuda/cuda_nvtx.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/gpus/cuda/cuda_nvtx.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index fefbf081c87e1c..b8aad7ed4994ee 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -53,6 +53,11 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "get_cxx_inc_directories", + "to_list_of_strings", +) 
_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX" @@ -67,20 +72,6 @@ _TF_CUDA_CONFIG_REPO = "TF_CUDA_CONFIG_REPO" _TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" -def to_list_of_strings(elements): - """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. - - This is to be used to put a list of strings into the bzl file templates - so it gets interpreted as list of strings in Starlark. - - Args: - elements: list of string elements - - Returns: - single string of elements wrapped in quotes separated by a comma.""" - quoted_strings = ["\"" + element + "\"" for element in elements] - return ", ".join(quoted_strings) - def verify_build_defines(params): """Verify all variables that crosstool/BUILD.tpl expects are substituted. @@ -238,156 +229,6 @@ def find_cc(repository_ctx, use_cuda_clang): " environment variable").format(target_cc_name, cc_path_envvar)) return cc -_INC_DIR_MARKER_BEGIN = "#include <...>" - -# OSX add " (framework directory)" at the end of line, strip it. -_OSX_FRAMEWORK_SUFFIX = " (framework directory)" -_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) - -def _cxx_inc_convert(path): - """Convert path returned by cc -E xc++ in a complete path.""" - path = path.strip() - if path.endswith(_OSX_FRAMEWORK_SUFFIX): - path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() - return path - -def _normalize_include_path(repository_ctx, path): - """Normalizes include paths before writing them to the crosstool. - - If path points inside the 'crosstool' folder of the repository, a relative - path is returned. - If path points outside the 'crosstool' folder, an absolute path is returned. - """ - path = str(repository_ctx.path(path)) - crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) - - if path.startswith(crosstool_folder): - # We drop the path to "$REPO/crosstool" and a trailing path separator. - return path[len(crosstool_folder) + 1:] - return path - -def _is_compiler_option_supported(repository_ctx, cc, option): - """Checks that `option` is supported by the C compiler. Doesn't %-escape the option.""" - result = repository_ctx.execute([ - cc, - option, - "-o", - "/dev/null", - "-c", - str(repository_ctx.path("tools/cpp/empty.cc")), - ]) - return result.stderr.find(option) == -1 - -def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sysroot): - """Compute the list of default C or C++ include directories.""" - if lang_is_cpp: - lang = "c++" - else: - lang = "c" - sysroot = [] - if tf_sysroot: - sysroot += ["--sysroot", tf_sysroot] - result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + - sysroot) - stderr = err_out(result) - index1 = stderr.find(_INC_DIR_MARKER_BEGIN) - if index1 == -1: - return [] - index1 = stderr.find("\n", index1) - if index1 == -1: - return [] - index2 = stderr.rfind("\n ") - if index2 == -1 or index2 < index1: - return [] - index2 = stderr.find("\n", index2 + 1) - if index2 == -1: - inc_dirs = stderr[index1 + 1:] - else: - inc_dirs = stderr[index1 + 1:index2].strip() - - print_resource_dir_supported = _is_compiler_option_supported( - repository_ctx, - cc, - "-print-resource-dir", - ) - - if print_resource_dir_supported: - resource_dir = repository_ctx.execute( - [cc, "-print-resource-dir"], - ).stdout.strip() + "/share" - inc_dirs += "\n" + resource_dir - - compiler_includes = [ - _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) - for p in inc_dirs.split("\n") - ] - - # The compiler might be on a symlink, e.g. 
/symlink -> /opt/gcc - # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) - # but Bazel might encounter either (usually reported by the compiler) - # especially when a compiler wrapper (e.g. ccache) is used. - # So we need to also include paths where symlinks are not resolved. - - # Try to find real path to CC installation to "see through" compiler wrappers - # GCC has the path to g++ - index1 = result.stderr.find("COLLECT_GCC=") - if index1 != -1: - index1 = result.stderr.find("=", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname - else: - # Clang has the directory - index1 = result.stderr.find("InstalledDir: ") - if index1 != -1: - index1 = result.stderr.find(" ", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname - else: - # Fallback to the CC path - cc_topdir = repository_ctx.path(cc).dirname.dirname - - # We now have the compiler installation prefix, e.g. /symlink/gcc - # And the resolved installation prefix, e.g. /opt/gcc - cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() - cc_topdir = str(cc_topdir).strip() - - # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. - # e.g. [/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] - # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] - if cc_topdir_resolved != cc_topdir: - unresolved_compiler_includes = [ - cc_topdir + inc[len(cc_topdir_resolved):] - for inc in compiler_includes - if inc.startswith(cc_topdir_resolved) - ] - compiler_includes = compiler_includes + unresolved_compiler_includes - return compiler_includes - -def get_cxx_inc_directories(repository_ctx, cc, tf_sysroot): - """Compute the list of default C and C++ include directories.""" - - # For some reason `clang -xc` sometimes returns include paths that are - # different from the ones from `clang -xc++`. (Symlink and a dir) - # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists - includes_cpp = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - True, - tf_sysroot, - ) - includes_c = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - False, - tf_sysroot, - ) - - return includes_cpp + [ - inc - for inc in includes_c - if inc not in includes_cpp - ] - def auto_configure_fail(msg): """Output failure message when cuda configuration fails.""" red = "\033[0;31m" @@ -1293,6 +1134,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cuda_nvcc_files}"] = "[]" if is_cuda_clang and not is_nvcc_and_clang: cuda_defines["%{host_compiler_path}"] = str(cc) cuda_defines["%{host_compiler_warnings}"] = """ diff --git a/third_party/gpus/hermetic_cuda_configure.bzl b/third_party/gpus/hermetic_cuda_configure.bzl new file mode 100644 index 00000000000000..5d16aa6f76a1f4 --- /dev/null +++ b/third_party/gpus/hermetic_cuda_configure.bzl @@ -0,0 +1,570 @@ +"""Repository rule for hermetic CUDA autoconfiguration. + +`hermetic_cuda_configure` depends on the following environment variables: + + * `TF_NEED_CUDA`: Whether to enable building with CUDA. + * `TF_NVCC_CLANG`: Whether to use clang for C++ and NVCC for Cuda compilation. 
+ * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for + both host and device code compilation. + * `TF_SYSROOT`: The sysroot to use when compiling. + * `TF_CUDA_VERSION`: The version of the CUDA toolkit (mandatory). + * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is + `3.5,5.2`. + * `PYTHON_BIN_PATH`: The python binary path +""" + +load( + "//third_party/remote_config:common.bzl", + "get_cpu_value", + "get_host_environ", + "which", +) +load( + ":compiler_common_tools.bzl", + "get_cxx_inc_directories", + "to_list_of_strings", +) + +def _find_cc(repository_ctx): + """Find the C++ compiler.""" + cc_path_envvar = _CLANG_CUDA_COMPILER_PATH + cc_name = "clang" + + cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar) + if cc_name_from_env: + cc_name = cc_name_from_env + if cc_name.startswith("/"): + # Return the absolute path. + return cc_name + cc = which(repository_ctx, cc_name) + if cc == None: + fail(("Cannot find {}, either correct your path or set the {}" + + " environment variable").format(cc_name, cc_path_envvar)) + return cc + +def _auto_configure_fail(msg): + """Output failure message when cuda configuration fails.""" + red = "\033[0;31m" + no_color = "\033[0m" + fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) + +def _lib_name(base_name, cpu_value, version = None, static = False): + """Constructs the platform-specific name of a library. + + Args: + base_name: The name of the library, such as "cudart" + cpu_value: The name of the host operating system. + version: The version of the library. + static: True the library is static or False if it is a shared object. + + Returns: + The platform-specific name of the library. + """ + version = "" if not version else "." + version + if cpu_value in ("Linux"): + if static: + return "lib%s.a" % base_name + return "lib%s.so%s" % (base_name, version) + elif cpu_value == "Windows": + return "%s.lib" % base_name + elif cpu_value == "Darwin": + if static: + return "lib%s.a" % base_name + return "lib%s%s.dylib" % (base_name, version) + else: + _auto_configure_fail("Invalid cpu_value: %s" % cpu_value) + +def _verify_build_defines(params): + """Verify all variables that crosstool/BUILD.tpl expects are substituted. + + Args: + params: dict of variables that will be passed to the BUILD.tpl template. + """ + missing = [] + for param in [ + "cxx_builtin_include_directories", + "extra_no_canonical_prefixes_flags", + "host_compiler_path", + "host_compiler_prefix", + "host_compiler_warnings", + "linker_bin_path", + "compiler_deps", + "msvc_cl_path", + "msvc_env_include", + "msvc_env_lib", + "msvc_env_path", + "msvc_env_tmp", + "msvc_lib_path", + "msvc_link_path", + "msvc_ml_path", + "unfiltered_compile_flags", + "win_compiler_deps", + ]: + if ("%{" + param + "}") not in params: + missing.append(param) + + if missing: + _auto_configure_fail( + "BUILD.tpl template is missing these variables: " + + str(missing) + + ".\nWe only got: " + + str(params) + + ".", + ) + +def get_cuda_version(repository_ctx): + return get_host_environ(repository_ctx, _TF_CUDA_VERSION) + +def enable_cuda(repository_ctx): + """Returns whether to build with CUDA support.""" + return int(get_host_environ(repository_ctx, TF_NEED_CUDA, False)) + +def _flag_enabled(repository_ctx, flag_name): + return get_host_environ(repository_ctx, flag_name) == "1" + +def _use_nvcc_and_clang(repository_ctx): + # Returns the flag if we need to use clang for C++ and NVCC for Cuda. 
+ return _flag_enabled(repository_ctx, _TF_NVCC_CLANG) + +def _tf_sysroot(repository_ctx): + return get_host_environ(repository_ctx, _TF_SYSROOT, "") + +def _py_tmpl_dict(d): + return {"%{cuda_config}": str(d)} + +def _cudart_static_linkopt(cpu_value): + """Returns additional platform-specific linkopts for cudart.""" + return "\"\"," if cpu_value == "Darwin" else "\"-lrt\"," + +def _compute_capabilities(repository_ctx): + """Returns a list of strings representing cuda compute capabilities. + + Args: + repository_ctx: the repo rule's context. + + Returns: + list of cuda architectures to compile for. 'compute_xy' refers to + both PTX and SASS, 'sm_xy' refers to SASS only. + """ + capabilities = get_host_environ( + repository_ctx, + _TF_CUDA_COMPUTE_CAPABILITIES, + "compute_35,compute_52", + ).split(",") + + # Map old 'x.y' capabilities to 'compute_xy'. + if len(capabilities) > 0 and all([len(x.split(".")) == 2 for x in capabilities]): + # If all capabilities are in 'x.y' format, only include PTX for the + # highest capability. + cc_list = sorted([x.replace(".", "") for x in capabilities]) + capabilities = ["sm_%s" % x for x in cc_list[:-1]] + ["compute_%s" % cc_list[-1]] + for i, capability in enumerate(capabilities): + parts = capability.split(".") + if len(parts) != 2: + continue + capabilities[i] = "compute_%s%s" % (parts[0], parts[1]) + + # Make list unique + capabilities = dict(zip(capabilities, capabilities)).keys() + + # Validate capabilities. + for capability in capabilities: + if not capability.startswith(("compute_", "sm_")): + _auto_configure_fail("Invalid compute capability: %s" % capability) + for prefix in ["compute_", "sm_"]: + if not capability.startswith(prefix): + continue + if len(capability) == len(prefix) + 2 and capability[-2:].isdigit(): + continue + if len(capability) == len(prefix) + 3 and capability.endswith("90a"): + continue + _auto_configure_fail("Invalid compute capability: %s" % capability) + + return capabilities + +def _compute_cuda_extra_copts(compute_capabilities): + copts = ["--no-cuda-include-ptx=all"] + for capability in compute_capabilities: + if capability.startswith("compute_"): + capability = capability.replace("compute_", "sm_") + copts.append("--cuda-include-ptx=%s" % capability) + copts.append("--cuda-gpu-arch=%s" % capability) + + return str(copts) + +def _get_cuda_config(repository_ctx): + """Detects and returns information about the CUDA installation on the system. + + Args: + repository_ctx: The repository context. + + Returns: + A struct containing the following fields: + cuda_version: The version of CUDA on the system. + cudart_version: The CUDA runtime version on the system. + cudnn_version: The version of cuDNN on the system. + compute_capabilities: A list of the system's CUDA compute capabilities. + cpu_value: The name of the host operating system. 
+ """ + + return struct( + cuda_version = get_cuda_version(repository_ctx), + cupti_version = repository_ctx.read(repository_ctx.attr.cupti_version), + cudart_version = repository_ctx.read(repository_ctx.attr.cudart_version), + cublas_version = repository_ctx.read(repository_ctx.attr.cublas_version), + cusolver_version = repository_ctx.read(repository_ctx.attr.cusolver_version), + curand_version = repository_ctx.read(repository_ctx.attr.curand_version), + cufft_version = repository_ctx.read(repository_ctx.attr.cufft_version), + cusparse_version = repository_ctx.read(repository_ctx.attr.cusparse_version), + cudnn_version = repository_ctx.read(repository_ctx.attr.cudnn_version), + compute_capabilities = _compute_capabilities(repository_ctx), + cpu_value = get_cpu_value(repository_ctx), + ) + +_DUMMY_CROSSTOOL_BZL_FILE = """ +def error_gpu_disabled(): + fail("ERROR: Building with --config=cuda but TensorFlow is not configured " + + "to build with GPU support. Please re-run ./configure and enter 'Y' " + + "at the prompt to build with GPU support.") + + native.genrule( + name = "error_gen_crosstool", + outs = ["CROSSTOOL"], + cmd = "echo 'Should not be run.' && exit 1", + ) + + native.filegroup( + name = "crosstool", + srcs = [":CROSSTOOL"], + output_licenses = ["unencumbered"], + ) +""" + +_DUMMY_CROSSTOOL_BUILD_FILE = """ +load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") + +error_gpu_disabled() +""" + +def _create_dummy_repository(repository_ctx): + cpu_value = get_cpu_value(repository_ctx) + + # Set up BUILD file for cuda/. + repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "False", + "%{cuda_extra_copts}": "[]", + "%{cuda_gpu_architectures}": "[]", + "%{cuda_version}": "0.0", + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.dummy_cuda_build_tpl, + { + "%{cuda_driver_lib}": _lib_name("cuda", cpu_value), + "%{cudart_static_lib}": _lib_name( + "cudart_static", + cpu_value, + static = True, + ), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), + "%{cudart_lib}": _lib_name("cudart", cpu_value), + "%{cublas_lib}": _lib_name("cublas", cpu_value), + "%{cublasLt_lib}": _lib_name("cublasLt", cpu_value), + "%{cusolver_lib}": _lib_name("cusolver", cpu_value), + "%{cudnn_lib}": _lib_name("cudnn", cpu_value), + "%{cufft_lib}": _lib_name("cufft", cpu_value), + "%{curand_lib}": _lib_name("curand", cpu_value), + "%{cupti_lib}": _lib_name("cupti", cpu_value), + "%{cusparse_lib}": _lib_name("cusparse", cpu_value), + "%{cub_actual}": ":cuda_headers", + "%{copy_rules}": """ +filegroup(name="cuda-include") +filegroup(name="cublas-include") +filegroup(name="cusolver-include") +filegroup(name="cufft-include") +filegroup(name="cusparse-include") +filegroup(name="curand-include") +filegroup(name="cudnn-include") +""", + }, + ) + + # Create dummy files for the CUDA toolkit since they are still required by + # tensorflow/tsl/platform/default/build_config:cuda. 
+ repository_ctx.file("cuda/cuda/include/cuda.h") + repository_ctx.file("cuda/cuda/include/cublas.h") + repository_ctx.file("cuda/cuda/include/cudnn.h") + repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") + repository_ctx.file("cuda/cuda/nvml/include/nvml.h") + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value)) + repository_ctx.file( + "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value), + ) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublasLt", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusparse", cpu_value)) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. + repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": "", + "%{cudart_version}": "", + "%{cupti_version}": "", + "%{cublas_version}": "", + "%{cusolver_version}": "", + "%{curand_version}": "", + "%{cufft_version}": "", + "%{cusparse_version}": "", + "%{cudnn_version}": "", + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": "", + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({}), + ) + + # If cuda_configure is not configured to build with GPU support, and the user + # attempts to build with --config=cuda, add a dummy build rule to intercept + # this and fail with an actionable error message. + repository_ctx.file( + "crosstool/error_gpu_disabled.bzl", + _DUMMY_CROSSTOOL_BZL_FILE, + ) + repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE) + +def _create_local_cuda_repository(repository_ctx): + """Creates the repository containing files set up to build with CUDA.""" + cuda_config = _get_cuda_config(repository_ctx) + + # Set up BUILD file for cuda/ + repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "True", + "%{cuda_extra_copts}": _compute_cuda_extra_copts( + cuda_config.compute_capabilities, + ), + "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities), + "%{cuda_version}": cuda_config.cuda_version, + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.cuda_build_tpl, + { + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cub_actual}": ":cuda_headers", + }, + ) + + is_nvcc_and_clang = _use_nvcc_and_clang(repository_ctx) + tf_sysroot = _tf_sysroot(repository_ctx) + + # Set up crosstool/ + cc = _find_cc(repository_ctx) + host_compiler_includes = get_cxx_inc_directories( + repository_ctx, + cc, + tf_sysroot, + ) + + cuda_defines = {} + + # We do not support hermetic CUDA on Windows. + # This ensures the CROSSTOOL file parser is happy. 
+ cuda_defines.update({ + "%{msvc_env_tmp}": "msvc_not_used", + "%{msvc_env_path}": "msvc_not_used", + "%{msvc_env_include}": "msvc_not_used", + "%{msvc_env_lib}": "msvc_not_used", + "%{msvc_cl_path}": "msvc_not_used", + "%{msvc_ml_path}": "msvc_not_used", + "%{msvc_link_path}": "msvc_not_used", + "%{msvc_lib_path}": "msvc_not_used", + "%{win_compiler_deps}": ":empty", + }) + + cuda_defines["%{builtin_sysroot}"] = tf_sysroot + cuda_defines["%{cuda_toolkit_path}"] = repository_ctx.attr.nvcc_binary.workspace_root + cuda_defines["%{compiler}"] = "clang" + cuda_defines["%{host_compiler_prefix}"] = "/usr/bin" + cuda_defines["%{linker_bin_path}"] = "" + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" + cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings(host_compiler_includes) + cuda_defines["%{cuda_nvcc_files}"] = "if_cuda([\"@{nvcc_archive}//:bin\", \"@{nvcc_archive}//:nvvm\"])".format(nvcc_archive = repository_ctx.attr.nvcc_binary.repo_name) + + if not is_nvcc_and_clang: + cuda_defines["%{host_compiler_path}"] = str(cc) + cuda_defines["%{host_compiler_warnings}"] = """ + # Some parts of the codebase set -Werror and hit this warning, so + # switch it off for now. + "-Wno-invalid-partial-specialization" + """ + cuda_defines["%{compiler_deps}"] = ":cuda_nvcc_files" + repository_ctx.file( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + "", + ) + else: + cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" + cuda_defines["%{host_compiler_warnings}"] = "" + + nvcc_relative_path = "%s/%s" % (repository_ctx.attr.nvcc_binary.workspace_root, repository_ctx.attr.nvcc_binary.name) + cuda_defines["%{compiler_deps}"] = ":crosstool_wrapper_driver_is_not_gcc" + + wrapper_defines = { + "%{cpu_compiler}": str(cc), + "%{cuda_version}": cuda_config.cuda_version, + "%{nvcc_path}": nvcc_relative_path, + "%{host_compiler_path}": str(cc), + "%{use_clang_compiler}": "True", + } + repository_ctx.template( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + repository_ctx.attr.crosstool_wrapper_driver_is_not_gcc_tpl, + wrapper_defines, + ) + + _verify_build_defines(cuda_defines) + + # Only expand template variables in the BUILD file + repository_ctx.template( + "crosstool/BUILD", + repository_ctx.attr.crosstool_build_tpl, + cuda_defines, + ) + + # No templating of cc_toolchain_config - use attributes and templatize the + # BUILD file. + repository_ctx.template( + "crosstool/cc_toolchain_config.bzl", + repository_ctx.attr.cc_toolchain_config_tpl, + {}, + ) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. 
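+    # Rendered sketch, assuming cuda_config.h.tpl emits one macro per +    # substitution key (the macro names here are illustrative; the template +    # defines the real ones). With TF_CUDA_VERSION="12.3" and cuDNN 8.9 the +    # output would contain lines like: +    # +    #   #define TF_CUDA_VERSION "12.3" +    #   #define TF_CUDNN_VERSION "8.9"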
+ repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": cuda_config.cuda_version, + "%{cudart_version}": cuda_config.cudart_version, + "%{cupti_version}": cuda_config.cupti_version, + "%{cublas_version}": cuda_config.cublas_version, + "%{cusolver_version}": cuda_config.cusolver_version, + "%{curand_version}": cuda_config.curand_version, + "%{cufft_version}": cuda_config.cufft_version, + "%{cusparse_version}": cuda_config.cusparse_version, + "%{cudnn_version}": cuda_config.cudnn_version, + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": ", ".join([ + cc.split("_")[1] + for cc in cuda_config.compute_capabilities + ]), + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({ + "cuda_version": cuda_config.cuda_version, + "cudnn_version": cuda_config.cudnn_version, + "cuda_compute_capabilities": cuda_config.compute_capabilities, + "cpu_compiler": str(cc), + }), + ) + +def _cuda_autoconf_impl(repository_ctx): + """Implementation of the cuda_autoconf repository rule.""" + build_file = repository_ctx.attr.local_config_cuda_build_file + + if not enable_cuda(repository_ctx): + _create_dummy_repository(repository_ctx) + else: + _create_local_cuda_repository(repository_ctx) + + repository_ctx.symlink(build_file, "BUILD") + +_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" +_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" +_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" +_TF_CUDA_VERSION = "TF_CUDA_VERSION" +TF_NEED_CUDA = "TF_NEED_CUDA" +_TF_NVCC_CLANG = "TF_NVCC_CLANG" +_TF_SYSROOT = "TF_SYSROOT" + +_ENVIRONS = [ + _CLANG_CUDA_COMPILER_PATH, + TF_NEED_CUDA, + _TF_NVCC_CLANG, + _TF_CUDA_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + _TF_SYSROOT, + _PYTHON_BIN_PATH, + "TMP", + "TMPDIR", +] + +hermetic_cuda_configure = repository_rule( + implementation = _cuda_autoconf_impl, + environ = _ENVIRONS, + attrs = { + "environ": attr.string_dict(), + "cublas_version": attr.label(default = Label("@cuda_cublas//:version.txt")), + "cudart_version": attr.label(default = Label("@cuda_cudart//:version.txt")), + "cudnn_version": attr.label(default = Label("@cuda_cudnn//:version.txt")), + "cufft_version": attr.label(default = Label("@cuda_cufft//:version.txt")), + "cupti_version": attr.label(default = Label("@cuda_cupti//:version.txt")), + "curand_version": attr.label(default = Label("@cuda_curand//:version.txt")), + "cusolver_version": attr.label(default = Label("@cuda_cusolver//:version.txt")), + "cusparse_version": attr.label(default = Label("@cuda_cusparse//:version.txt")), + "nvcc_binary": attr.label(default = Label("@cuda_nvcc//:bin/nvcc")), + "local_config_cuda_build_file": attr.label(default = Label("//third_party/gpus:local_config_cuda.BUILD")), + "build_defs_tpl": attr.label(default = Label("//third_party/gpus/cuda:build_defs.bzl.tpl")), + "cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.hermetic.tpl")), + "dummy_cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.tpl")), + "cuda_config_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.h.tpl")), + "cuda_config_py_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.py.tpl")), + "crosstool_wrapper_driver_is_not_gcc_tpl": attr.label(default = 
Label("//third_party/gpus/crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl")), + "crosstool_build_tpl": attr.label(default = Label("//third_party/gpus/crosstool:BUILD.tpl")), + "cc_toolchain_config_tpl": attr.label(default = Label("//third_party/gpus/crosstool:cc_toolchain_config.bzl.tpl")), + }, +) +"""Detects and configures the hermetic CUDA toolchain. + +Add the following to your WORKSPACE FILE: + +```python +hermetic cuda_configure(name = "local_config_cuda") +``` + +Args: + name: A unique name for this workspace rule. +""" diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 0fd4019fc5bb75..cf756b452e3950 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -22,12 +22,15 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "enable_cuda", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) load( ":sycl_configure.bzl", diff --git a/third_party/gpus/sycl_configure.bzl b/third_party/gpus/sycl_configure.bzl index 05330b2fe53195..dd80694e7274f5 100644 --- a/third_party/gpus/sycl_configure.bzl +++ b/third_party/gpus/sycl_configure.bzl @@ -16,11 +16,14 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl index 53a6d4e1e41890..a0930df34ecec8 100644 --- a/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/nccl/build_defs.bzl.tpl @@ -5,7 +5,6 @@ load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") # CUDA toolkit version as tuple (e.g. '(11, 1)'). _cuda_version = %{cuda_version} -_cuda_clang = %{cuda_clang} def _rdc_copts(): """Returns copts for compiling relocatable device code.""" @@ -121,25 +120,25 @@ _device_link = rule( "gpu_archs": attr.string_list(), "nvlink_args": attr.string_list(), "_nvlink": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"), + default = Label("%{nvlink_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_fatbinary": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"), + default = Label("%{fatbinary_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_bin2c": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"), + default = Label("%{bin2c_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_link_stub": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"), + default = Label("%{link_stub_label}"), allow_single_file = True, ), }, @@ -189,7 +188,7 @@ _prune_relocatable_code = rule( "input": attr.label(mandatory = True, allow_files = True), "gpu_archs": attr.string_list(), "_nvprune": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"), + default = Label("%{nvprune_label}"), allow_single_file = True, executable = True, cfg = "host", diff --git a/third_party/nccl/hermetic_nccl_configure.bzl b/third_party/nccl/hermetic_nccl_configure.bzl new file mode 100644 index 00000000000000..b99cbcb08db58a --- /dev/null +++ b/third_party/nccl/hermetic_nccl_configure.bzl @@ -0,0 +1,153 @@ +"""Repository rule for hermetic NCCL configuration. 
+ +`hermetic_nccl_configure` depends on the following environment variables: + + * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should + be used, "0" if NCCL should be linked in statically. + +""" + +load( + "//third_party/gpus:hermetic_cuda_configure.bzl", + "TF_NEED_CUDA", + "enable_cuda", + "get_cuda_version", +) +load( + "//third_party/remote_config:common.bzl", + "get_cpu_value", + "get_host_environ", +) + +_TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" + +_NCCL_DUMMY_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl_config", + hdrs = ["nccl_config.h"], + include_prefix = "third_party/nccl", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_STUB_BUILD_CONTENT = """ +alias( + name = "nccl_lib", + actual = "@cuda_nccl//:nccl_lib", +) + +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl_via_stub", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_headers", + actual = "@nccl_archive//:nccl_headers", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +def _create_local_nccl_repository(repository_ctx): + cuda_version = get_cuda_version(repository_ctx) + if cuda_version == "12": + cuda_version = "12.3" + cuda_version = cuda_version.split(".") + + # Alias to open source build from @nccl_archive. + if get_host_environ(repository_ctx, _TF_NCCL_USE_STUB, "0") == "0": + repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT) + else: + repository_ctx.file("BUILD", _NCCL_ARCHIVE_STUB_BUILD_CONTENT) + + repository_ctx.template("generated_names.bzl", repository_ctx.attr.generated_names_tpl, {}) + repository_ctx.template( + "build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), + "%{nvlink_label}": "@cuda_nvcc//:nvlink", + "%{fatbinary_label}": "@cuda_nvcc//:fatbinary", + "%{bin2c_label}": "@cuda_nvcc//:bin2c", + "%{link_stub_label}": "@cuda_nvcc//:link_stub", + "%{nvprune_label}": "@cuda_nvprune//:nvprune", + }, + ) + +def _nccl_autoconf_impl(repository_ctx): + if (not enable_cuda(repository_ctx) or + get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")): + # Add a dummy build file to make bazel query happy. 
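+        # The dummy repository keeps labels such as @local_config_nccl//:nccl +        # resolvable on non-CUDA platforms; its cc_library carries no srcs, +        # so depending on it is a no-op.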
+ repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT) + repository_ctx.file("nccl_config.h", "#define TF_NCCL_VERSION \"\"") + else: + _create_local_nccl_repository(repository_ctx) + +_ENVIRONS = [ + TF_NEED_CUDA, +] + +hermetic_nccl_configure = repository_rule( + environ = _ENVIRONS, + implementation = _nccl_autoconf_impl, + attrs = { + "environ": attr.string_dict(), + "generated_names_tpl": attr.label(default = Label("//third_party/nccl:generated_names.bzl.tpl")), + "build_defs_tpl": attr.label(default = Label("//third_party/nccl:build_defs.bzl.tpl")), + "system_build_tpl": attr.label(default = Label("//third_party/nccl:system.BUILD.tpl")), + }, +) +"""Downloads and configures the hermetic NCCL configuration. + +Add the following to your WORKSPACE FILE: + +```python +hermetic_nccl_configure(name = "local_config_nccl") +``` + +Args: + name: A unique name for this workspace rule. +""" diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index 22cf64d4771062..4da2513e03eb44 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -8,7 +8,6 @@ files. * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is `/usr/local/cuda,usr/`. - * `TF_CUDA_CLANG`: "1" if using Clang, "0" if using NVCC. * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should be used, "0" if NCCL should be linked in statically. @@ -33,7 +32,6 @@ _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" _TF_NCCL_VERSION = "TF_NCCL_VERSION" _TF_NEED_CUDA = "TF_NEED_CUDA" _TF_CUDA_PATHS = "TF_CUDA_PATHS" -_TF_CUDA_CLANG = "TF_CUDA_CLANG" _TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR" @@ -129,7 +127,11 @@ def _create_local_nccl_repository(repository_ctx): _label("build_defs.bzl.tpl"), { "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), - "%{cuda_clang}": repr(get_host_environ(repository_ctx, _TF_CUDA_CLANG)), + "%{nvlink_label}": "@local_config_cuda//cuda:cuda/bin/nvlink", + "%{fatbinary_label}": "@local_config_cuda//cuda:cuda/bin/fatbinary", + "%{bin2c_label}": "@local_config_cuda//cuda:cuda/bin/bin2c", + "%{link_stub_label}": "@local_config_cuda//cuda:cuda/bin/crt/link.stub", + "%{nvprune_label}": "@local_config_cuda//cuda:cuda/bin/nvprune", }, ) else: @@ -181,7 +183,6 @@ _ENVIRONS = [ _TF_CUDA_COMPUTE_CAPABILITIES, _TF_NEED_CUDA, _TF_CUDA_PATHS, - _TF_CUDA_CLANG, ] remote_nccl_configure = repository_rule( diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc index 02dec0349c4741..c17ae4494dc99c 100644 --- a/third_party/xla/.bazelrc +++ b/third_party/xla/.bazelrc @@ -226,12 +226,13 @@ build:cuda --repo_env TF_NEED_CUDA=1 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --@local_config_cuda//:enable_cuda +build:no_cuda_libs --@local_config_cuda//cuda:include_hermetic_cuda_libs=false + # CUDA: This config refers to building CUDA op kernels with clang. build:cuda_clang --config=cuda -# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt -build:cuda_clang --config=tensorrt build:cuda_clang --action_env=TF_CUDA_CLANG="1" build:cuda_clang --@local_config_cuda//:cuda_compiler=clang +build:cuda_clang --copt=-Qunused-arguments # Select supported compute capabilities (supported graphics cards). # This is the same as the official TensorFlow builds. 
# See https://developer.nvidia.com/cuda-gpus#compute @@ -244,12 +245,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. build:cuda_clang_official --config=cuda_clang -build:cuda_clang_official --action_env=TF_CUDA_VERSION="12" -build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8" -build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3" +build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3" +build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8.9" build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang" -build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" # Build with nvcc for CUDA and clang for host @@ -545,10 +544,6 @@ build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda" -build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt" -build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl" -test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --config=nvcc_clang @@ -633,7 +628,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla # Test-related settings below this point. test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true test:release_linux_base --local_test_jobs=HOST_CPUS -test:release_linux_base --test_env=LD_LIBRARY_PATH # Give only the list of failed tests at the end of the log test:release_linux_base --test_summary=short @@ -645,7 +639,6 @@ build:release_gpu_linux --config=release_cpu_linux # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. # Note that linux cpu and cuda builds share the same toolchain now. 
build:release_gpu_linux --config=cuda_clang_official -test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" # Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute @@ -674,11 +667,8 @@ test:unsupported_cpu_linux --config=release_base build:unsupported_gpu_linux --config=cuda build:unsupported_gpu_linux --config=unsupported_cpu_linux build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11" -build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8" +build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8.6" build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" -build:unsupported_gpu_linux --config=tensorrt -build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2" -build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib" build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain diff --git a/third_party/xla/build_tools/configure/configure.py b/third_party/xla/build_tools/configure/configure.py index 663e4b8724280d..74d571909a2c57 100755 --- a/third_party/xla/build_tools/configure/configure.py +++ b/third_party/xla/build_tools/configure/configure.py @@ -27,11 +27,6 @@ the clang in your path. If that isn't the correct clang, you can override like `./configure.py --backend=cpu --clang_path=`. -NOTE(ddunleavy): Lots of these things should probably be outside of configure.py -but are here because of complexity in `cuda_configure.bzl` and the TF bazelrc. -Once XLA has it's own bazelrc, and cuda_configure.bzl is replaced or refactored, -we can probably make this file smaller. - TODO(ddunleavy): add more thorough validation. """ import argparse @@ -45,18 +40,10 @@ import sys from typing import Optional -_REQUIRED_CUDA_LIBRARIES = ["cublas", "cuda", "cudnn"] +_REQUIRED_CUDA_LIBRARIES = ["cuda", "cudnn"] _DEFAULT_BUILD_AND_TEST_TAG_FILTERS = ("-no_oss",) # Assume we are being invoked from the symlink at the root of the repo _XLA_SRC_ROOT = pathlib.Path(__file__).absolute().parent -_FIND_CUDA_CONFIG = str( - _XLA_SRC_ROOT - / "third_party" - / "tsl" - / "third_party" - / "gpus" - / "find_cuda_config.py" -) _XLA_BAZELRC_NAME = "xla_configure.bazelrc" _KW_ONLY_IF_PYTHON310 = {"kw_only": True} if sys.version_info >= (3, 10) else {} @@ -218,11 +205,9 @@ class DiscoverablePathsAndVersions: ld_library_path: Optional[str] = None # CUDA specific - cublas_version: Optional[str] = None - cuda_toolkit_path: Optional[str] = None + cuda_version: Optional[str] = None cuda_compute_capabilities: Optional[list[str]] = None cudnn_version: Optional[str] = None - nccl_version: Optional[str] = None def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"): """Gets paths and versions as needed by the config. @@ -241,7 +226,7 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"): ) # Notably, we don't use `_find_executable_or_die` for lld, as it changes - # which commands it accepts based on it's name! ld.lld is symlinked to a + # which commands it accepts based on its name! 
ld.lld is symlinked to a # different executable just called lld, which should not be invoked # directly. self.lld_path = self.lld_path or shutil.which("ld.lld") @@ -255,64 +240,6 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"): if not self.cuda_compute_capabilities: self.cuda_compute_capabilities = _get_cuda_compute_capabilities_or_die() - self._get_cuda_libraries_paths_and_versions_if_needed(config) - - def _get_cuda_libraries_paths_and_versions_if_needed( - self, config: "XLAConfigOptions" - ): - """Gets cuda paths and versions if user left any unspecified. - - This uses `find_cuda_config.py` to find versions for all libraries in - `_REQUIRED_CUDA_LIBRARIES`. - - Args: - config: config that determines which libraries should be found. - """ - should_find_nccl = config.using_nccl and self.nccl_version is None - any_cuda_config_unset = any([ - self.cublas_version is None, - self.cuda_toolkit_path is None, - self.cudnn_version is None, - should_find_nccl, - ]) - - maybe_nccl = ["nccl"] if should_find_nccl else [] - - if any_cuda_config_unset: - logging.info( - "Some CUDA config versions and paths were not provided, " - "so trying to find them using find_cuda_config.py" - ) - try: - find_cuda_config_proc = subprocess.run( - [ - sys.executable, - _FIND_CUDA_CONFIG, - *_REQUIRED_CUDA_LIBRARIES, - *maybe_nccl, - ], - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - logging.info("Command %s failed. Is CUDA installed?", e.cmd) - logging.info("Dumping %s ouptut:\n %s", e.cmd, e.output) - raise e - - cuda_config = dict( - tuple(line.split(": ")) - for line in find_cuda_config_proc.stdout.strip().split("\n") - ) - - self.cublas_version = self.cublas_version or cuda_config["cublas_version"] - self.cuda_toolkit_path = ( - self.cuda_toolkit_path or cuda_config["cuda_toolkit_path"] - ) - self.cudnn_version = self.cudnn_version or cuda_config["cudnn_version"] - if should_find_nccl: - self.nccl_version = self.nccl_version or cuda_config["nccl_version"] - @dataclasses.dataclass(frozen=True, **_KW_ONLY_IF_PYTHON310) class XLAConfigOptions: @@ -327,7 +254,6 @@ class XLAConfigOptions: # CUDA specific cuda_compiler: CudaCompiler using_nccl: bool - using_tensorrt: bool def to_bazelrc_lines( self, @@ -386,19 +312,13 @@ def to_bazelrc_lines( ) # Lines needed for CUDA backend regardless of CUDA/host compiler - rc.append( - f"build --action_env CUDA_TOOLKIT_PATH={dpav.cuda_toolkit_path}" - ) - rc.append(f"build --action_env TF_CUBLAS_VERSION={dpav.cublas_version}") + rc.append(f"build --action_env TF_CUDA_VERSION={dpav.cuda_version}") rc.append( "build --action_env" f" TF_CUDA_COMPUTE_CAPABILITIES={','.join(dpav.cuda_compute_capabilities)}" ) rc.append(f"build --action_env TF_CUDNN_VERSION={dpav.cudnn_version}") - rc.append(f"build --repo_env TF_NEED_TENSORRT={int(self.using_tensorrt)}") - if self.using_nccl: - rc.append(f"build --action_env TF_NCCL_VERSION={dpav.nccl_version}") - else: + if not self.using_nccl: rc.append("build --config nonccl") elif self.backend == Backend.ROCM: pass @@ -468,7 +388,6 @@ def _parse_args(): default="-Wno-sign-compare", ) parser.add_argument("--nccl", action="store_true") - parser.add_argument("--tensorrt", action="store_true") # Path and version overrides path_help = "Optional: will be found on PATH if possible." 
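With `find_cuda_config.py` and `_get_cuda_libraries_paths_and_versions_if_needed` gone, the CUDA branch of `to_bazelrc_lines` above collapses to a few `--action_env` lines driven by the user-supplied versions. A minimal sketch of the resulting behavior (the function name and flat signature here are illustrative, not part of the patch):

```python
# Illustrative sketch of the simplified CUDA bazelrc generation.
def cuda_bazelrc_lines(cuda_version, cudnn_version, capabilities, using_nccl):
    rc = [
        f"build --action_env TF_CUDA_VERSION={cuda_version}",
        f"build --action_env TF_CUDA_COMPUTE_CAPABILITIES={','.join(capabilities)}",
        f"build --action_env TF_CUDNN_VERSION={cudnn_version}",
    ]
    if not using_nccl:
        rc.append("build --config nonccl")
    return rc

# e.g. cuda_bazelrc_lines("12.3", "8.9", ["7.5"], using_nccl=False)
```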
@@ -484,13 +403,16 @@ def _parse_args(): parser.add_argument("--lld_path", help=path_help) # CUDA specific - find_cuda_config_help = ( - "Optional: will be found using `find_cuda_config.py` if flag is not set." + parser.add_argument( + "--cuda_version", + help="Optional: CUDA will be downloaded by Bazel if the flag is provided", + ) + parser.add_argument( + "--cudnn_version", + help=( + "Optional: CUDNN will be downloaded by Bazel if the flag is provided" + ), ) - parser.add_argument("--cublas_version", help=find_cuda_config_help) - parser.add_argument("--cuda_toolkit_path", help=find_cuda_config_help) - parser.add_argument("--cudnn_version", help=find_cuda_config_help) - parser.add_argument("--nccl_version", help=find_cuda_config_help) return parser.parse_args() @@ -510,7 +432,6 @@ def main(): python_bin_path=args.python_bin_path, compiler_options=args.compiler_options, using_nccl=args.nccl, - using_tensorrt=args.tensorrt, ) bazelrc_lines = config.to_bazelrc_lines( @@ -519,11 +440,9 @@ def main(): gcc_path=args.gcc_path, lld_path=args.lld_path, ld_library_path=args.ld_library_path, - cublas_version=args.cublas_version, - cuda_compute_capabilities=args.cuda_compute_capabilities, - cuda_toolkit_path=args.cuda_toolkit_path, + cuda_version=args.cuda_version, cudnn_version=args.cudnn_version, - nccl_version=args.nccl_version, + cuda_compute_capabilities=args.cuda_compute_capabilities, ) ) diff --git a/third_party/xla/build_tools/configure/configure_test.py b/third_party/xla/build_tools/configure/configure_test.py index c952c8f9241f4f..8a1ca1ab3c699f 100644 --- a/third_party/xla/build_tools/configure/configure_test.py +++ b/third_party/xla/build_tools/configure/configure_test.py @@ -32,12 +32,10 @@ # CUDA specific paths and versions _CUDA_SPECIFIC_PATHS_AND_VERSIONS = { - "cublas_version": "12.3", - "cuda_toolkit_path": "/usr/local/cuda-12.2", + "cuda_version": "12.3", "cuda_compute_capabilities": ["7.5"], "cudnn_version": "8", "ld_library_path": "/usr/local/nvidia/lib:/usr/local/nvidia/lib64", - "nccl_version": "2", } @@ -75,7 +73,6 @@ def test_clang_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -97,7 +94,6 @@ def test_gcc_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -118,7 +114,6 @@ def test_cuda_clang_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.CLANG, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -140,7 +135,6 @@ def test_nvcc_clang_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( @@ -162,7 +156,6 @@ def test_nvcc_gcc_bazelrc(self): compiler_options=list(_COMPILER_OPTIONS), cuda_compiler=CudaCompiler.NVCC, using_nccl=False, - using_tensorrt=False, ) bazelrc_lines = config.to_bazelrc_lines( diff --git a/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc b/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc index b998cf06935f33..62c5224a98dd19 100644 --- a/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/cuda_clang.bazelrc @@ -3,11 +3,9 @@ build --repo_env CC=/usr/lib/llvm-17/bin/clang build --repo_env 
BAZEL_COMPILER=/usr/lib/llvm-17/bin/clang build --config cuda_clang build --action_env CLANG_CUDA_COMPILER_PATH=/usr/lib/llvm-17/bin/clang -build --action_env CUDA_TOOLKIT_PATH=/usr/local/cuda-12.2 -build --action_env TF_CUBLAS_VERSION=12.3 +build --action_env TF_CUDA_VERSION=12.3 build --action_env TF_CUDA_COMPUTE_CAPABILITIES=7.5 build --action_env TF_CUDNN_VERSION=8 -build --repo_env TF_NEED_TENSORRT=0 build --config nonccl build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 build --action_env PYTHON_BIN_PATH=/usr/bin/python3 diff --git a/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc b/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc index 912dc50faff4c1..ab3d220cdea804 100644 --- a/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/nvcc_clang.bazelrc @@ -3,11 +3,9 @@ build --repo_env CC=/usr/lib/llvm-17/bin/clang build --repo_env BAZEL_COMPILER=/usr/lib/llvm-17/bin/clang build --config nvcc_clang build --action_env CLANG_CUDA_COMPILER_PATH=/usr/lib/llvm-17/bin/clang -build --action_env CUDA_TOOLKIT_PATH=/usr/local/cuda-12.2 -build --action_env TF_CUBLAS_VERSION=12.3 +build --action_env TF_CUDA_VERSION=12.3 build --action_env TF_CUDA_COMPUTE_CAPABILITIES=7.5 build --action_env TF_CUDNN_VERSION=8 -build --repo_env TF_NEED_TENSORRT=0 build --config nonccl build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 build --action_env PYTHON_BIN_PATH=/usr/bin/python3 diff --git a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc index 863209697362de..e27b41ffc01a99 100644 --- a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc @@ -1,10 +1,8 @@ build --action_env GCC_HOST_COMPILER_PATH=/usr/bin/gcc build --config cuda -build --action_env CUDA_TOOLKIT_PATH=/usr/local/cuda-12.2 -build --action_env TF_CUBLAS_VERSION=12.3 +build --action_env TF_CUDA_VERSION=12.3 build --action_env TF_CUDA_COMPUTE_CAPABILITIES=7.5 build --action_env TF_CUDNN_VERSION=8 -build --repo_env TF_NEED_TENSORRT=0 build --config nonccl build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 build --action_env PYTHON_BIN_PATH=/usr/bin/python3 diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files index 7655cabdafeb6b..3a7d2c1499c8b2 100644 --- a/third_party/xla/opensource_only.files +++ b/third_party/xla/opensource_only.files @@ -9,6 +9,8 @@ third_party/BUILD: third_party/__init__:.py third_party/compute_library/BUILD: third_party/compute_library/build_defs.bzl: +third_party/cuda_redist_json_repo.bzl: +third_party/cuda_repo.bzl: third_party/implib_so/BUILD: third_party/implib_so/get_symbols.py: third_party/implib_so/make_stub.py: diff --git a/third_party/xla/third_party/cuda_redist_json_repo.bzl b/third_party/xla/third_party/cuda_redist_json_repo.bzl new file mode 100644 index 00000000000000..76941dd74c9488 --- /dev/null +++ b/third_party/xla/third_party/cuda_redist_json_repo.bzl @@ -0,0 +1,110 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining CUDA and cuDNN JSON files with distributives versions.""" + +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_DEFAULT_CUDNN_VERSION = "8.9.7.29" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _cuda_redist_json_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + supported_cuda_versions = repository_ctx.attr.cuda_json_dict.keys() + if cuda_version and (cuda_version not in supported_cuda_versions): + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + else: + fail( + ("The supported CUDA versions are {supported_versions}." + + " Please provide a supported version in TF_CUDA_VERSION" + + " environment variable or add JSON URL for" + + " CUDA version={version}.") + .format( + supported_versions = supported_cuda_versions, + version = cuda_version, + ), + ) + supported_cudnn_versions = repository_ctx.attr.cudnn_json_dict.keys() + if cudnn_version and (cudnn_version not in supported_cudnn_versions): + if cudnn_version in ["8", "8.9"]: + cudnn_version = _DEFAULT_CUDNN_VERSION + else: + fail( + ("The supported CUDNN versions are {supported_versions}." + + " Please provide a supported version in TF_CUDNN_VERSION" + + " environment variable or add JSON URL for" + + " CUDNN version={version}.") + .format( + supported_versions = supported_cudnn_versions, + version = cudnn_version, + ), + ) + cuda_distributives = "{}" + cudnn_distributives = "{}" + if cuda_version: + (url, sha256) = repository_ctx.attr.cuda_json_dict[cuda_version] + json_file_name = "redistrib_cuda_%s.json" % cuda_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cuda_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + if cudnn_version: + (url, sha256) = repository_ctx.attr.cudnn_json_dict[cudnn_version] + json_file_name = "redistrib_cudnn_%s.json" % cudnn_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cudnn_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + + repository_ctx.file( + "build_defs.bzl", + """def get_cuda_distributives(): + return {cuda_distributives} + +def get_cudnn_distributives(): + return {cudnn_distributives} +""".format(cuda_distributives = cuda_distributives, cudnn_distributives = cudnn_distributives), + ) + repository_ctx.file( + "BUILD", + "", + ) + +_cuda_redist_json = repository_rule( + implementation = _cuda_redist_json_impl, + attrs = { + "cuda_json_dict": attr.string_list_dict(mandatory = True), + "cudnn_json_dict": attr.string_list_dict(mandatory = True), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_redist_json(name, cuda_json_dict, cudnn_json_dict): + _cuda_redist_json( + name = name, + cuda_json_dict = cuda_json_dict, + cudnn_json_dict = cudnn_json_dict, + ) diff --git a/third_party/xla/third_party/cuda_repo.bzl 
b/third_party/xla/third_party/cuda_repo.bzl new file mode 100644 index 00000000000000..0dcda26cba3a23 --- /dev/null +++ b/third_party/xla/third_party/cuda_repo.bzl @@ -0,0 +1,327 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining Google ML CUDA dependencies.""" + +load("@cuda_redist_json//:build_defs.bzl", "get_cuda_distributives", "get_cudnn_distributives") +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_OS_ARCH_DICT = { + "amd64": "x86_64-unknown-linux-gnu", + "aarch64": "aarch64-unknown-linux-gnu", +} +_REDIST_ARCH_DICT = { + "linux-x86_64": "x86_64-unknown-linux-gnu", + "linux-sbsa": "aarch64-unknown-linux-gnu", +} + +_CUDA_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cuda/redist/" +_CUDNN_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cudnn/redist/" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _get_archive_name(url, archive_suffix = ".tar.xz"): + last_slash_index = url.rfind("/") + return url[last_slash_index + 1:-len(archive_suffix)] + +def _cuda_http_archive_impl(repository_ctx): + cuda_or_cudnn_version = None + dist_version = "" + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + if repository_ctx.attr.is_cudnn_dist: + cuda_or_cudnn_version = cudnn_version + else: + cuda_or_cudnn_version = cuda_version + if cuda_or_cudnn_version: + # Download archive only when GPU config is used. 
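+        # relative_url_dict is keyed either by the bare arch (e.g.
+        # "x86_64-unknown-linux-gnu") or, for cuDNN archives shipped per CUDA
+        # release, by "cuda<major>_<arch>"; the fallback lookup below assumes
+        # TF_CUDA_VERSION is set when such keys are in use.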
+ dist_version = repository_ctx.attr.dist_version + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + if arch not in repository_ctx.attr.relative_url_dict.keys(): + (relative_url, sha256) = repository_ctx.attr.relative_url_dict["cuda{version}_{arch}" \ + .format( + version = cuda_version.split(".")[0], + arch = arch, + )] + else: + (relative_url, sha256) = repository_ctx.attr.relative_url_dict[arch] + url = (repository_ctx.attr.cudnn_dist_path_prefix if repository_ctx.attr.is_cudnn_dist else repository_ctx.attr.cuda_dist_path_prefix) + relative_url + + archive_name = _get_archive_name(url) + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.override_strip_prefix if repository_ctx.attr.override_strip_prefix else archive_name, + ) + if repository_ctx.attr.build_template: + version = dist_version.split(".")[0] if dist_version else "" + repository_ctx.file("version.txt", version) + repository_ctx.template( + "BUILD", + repository_ctx.attr.build_template, + {"%{version}": version}, + ) + else: + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_http_archive = repository_rule( + implementation = _cuda_http_archive_impl, + attrs = { + "dist_version": attr.string(mandatory = True), + "relative_url_dict": attr.string_list_dict(mandatory = True), + "build_template": attr.label(), + "build_file": attr.label(), + "is_cudnn_dist": attr.bool(), + "override_strip_prefix": attr.string(), + "cudnn_dist_path_prefix": attr.string(default = _CUDNN_DIST_PATH_PREFIX), + "cuda_dist_path_prefix": attr.string(default = _CUDA_DIST_PATH_PREFIX), + "extension": attr.string(default = ".tar.xz"), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_http_archive(name, dist_version, relative_url_dict, **kwargs): + _cuda_http_archive( + name = name, + dist_version = dist_version, + relative_url_dict = relative_url_dict, + **kwargs + ) + +def _cuda_wheel_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + if cuda_version: + # Download archive only when GPU config is used. + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + dict_key = "{cuda_version}-{arch}".format( + cuda_version = cuda_version, + arch = arch, + ) + supported_versions = repository_ctx.attr.url_dict.keys() + if dict_key not in supported_versions: + fail( + ("The supported NCCL versions are {supported_versions}." 
+ + " Please provide a supported CUDA version in TF_CUDA_VERSION" + + " environment variable or add NCCL distributive for" + + " CUDA version={version}, OS={arch}.") + .format( + supported_versions = supported_versions, + version = cuda_version, + arch = arch, + ), + ) + sha256 = repository_ctx.attr.sha256_dict[dict_key] + url = repository_ctx.attr.url_dict[dict_key] + + archive_name = _get_archive_name(url, archive_suffix = ".whl") + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.strip_prefix, + ) + + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_wheel = repository_rule( + implementation = _cuda_wheel_impl, + attrs = { + "sha256_dict": attr.string_dict(mandatory = True), + "url_dict": attr.string_dict(mandatory = True), + "build_file": attr.label(), + "strip_prefix": attr.string(), + "extension": attr.string(default = ".zip"), + }, + environ = ["TF_CUDA_VERSION"], +) + +def cuda_wheel(name, sha256_dict, url_dict, **kwargs): + _cuda_wheel( + name = name, + sha256_dict = sha256_dict, + url_dict = url_dict, + **kwargs + ) + +def _get_relative_url_dict(dist_info): + relative_url_dict = {} + for arch in _REDIST_ARCH_DICT.keys(): + # CUDNN JSON might contain paths for each CUDA version. + if "relative_path" not in dist_info[arch]: + for cuda_version, data in dist_info[arch].items(): + relative_url_dict["{cuda_version}_{arch}" \ + .format( + cuda_version = cuda_version, + arch = _REDIST_ARCH_DICT[arch], + )] = [data["relative_path"], data["sha256"]] + else: + relative_url_dict[_REDIST_ARCH_DICT[arch]] = [ + dist_info[arch]["relative_path"], + dist_info[arch]["sha256"], + ] + return relative_url_dict + +def _get_cuda_archive( + repo_name, + dist_dict, + dist_name, + build_file = None, + build_template = None, + is_cudnn_dist = False): + if dist_name in dist_dict.keys(): + return cuda_http_archive( + name = repo_name, + dist_version = dist_dict[dist_name]["version"], + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = _get_relative_url_dict(dist_dict[dist_name]), + is_cudnn_dist = is_cudnn_dist, + ) + else: + return cuda_http_archive( + name = repo_name, + dist_version = "", + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = {"": []}, + is_cudnn_dist = is_cudnn_dist, + ) + +def cuda_distributives(cuda_nccl_wheel_dict): + nccl_artifacts_dict = {"sha256_dict": {}, "url_dict": {}} + for cuda_version, nccl_wheel_info in cuda_nccl_wheel_dict.items(): + for arch in _OS_ARCH_DICT.values(): + if arch in nccl_wheel_info.keys(): + cuda_version_to_arch_key = "%s-%s" % (cuda_version, arch) + nccl_artifacts_dict["sha256_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["sha256"] + nccl_artifacts_dict["url_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["url"] + + cuda_wheel( + name = "cuda_nccl", + sha256_dict = nccl_artifacts_dict["sha256_dict"], + url_dict = nccl_artifacts_dict["url_dict"], + build_file = Label("//third_party/gpus/cuda:cuda_nccl.BUILD"), + strip_prefix = "nvidia/nccl", + ) + + cuda_distributives = get_cuda_distributives() + cudnn_distributives = get_cudnn_distributives() + + _get_cuda_archive( + repo_name = "cuda_cccl", + 
dist_dict = cuda_distributives, + dist_name = "cuda_cccl", + build_file = "//third_party/gpus/cuda:cuda_cccl.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_cublas", + dist_dict = cuda_distributives, + dist_name = "libcublas", + build_template = "//third_party/gpus/cuda:cuda_cublas.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudart", + dist_dict = cuda_distributives, + dist_name = "cuda_cudart", + build_template = "//third_party/gpus/cuda:cuda_cudart.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudnn", + dist_dict = cudnn_distributives, + dist_name = "cudnn", + build_template = "//third_party/gpus/cuda:cuda_cudnn.BUILD.tpl", + is_cudnn_dist = True, + ) + _get_cuda_archive( + repo_name = "cuda_cufft", + dist_dict = cuda_distributives, + dist_name = "libcufft", + build_template = "//third_party/gpus/cuda:cuda_cufft.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cupti", + dist_dict = cuda_distributives, + dist_name = "cuda_cupti", + build_template = "//third_party/gpus/cuda:cuda_cupti.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_curand", + dist_dict = cuda_distributives, + dist_name = "libcurand", + build_template = "//third_party/gpus/cuda:cuda_curand.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusolver", + dist_dict = cuda_distributives, + dist_name = "libcusolver", + build_template = "//third_party/gpus/cuda:cuda_cusolver.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusparse", + dist_dict = cuda_distributives, + dist_name = "libcusparse", + build_template = "//third_party/gpus/cuda:cuda_cusparse.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvcc", + dist_dict = cuda_distributives, + dist_name = "cuda_nvcc", + build_file = "//third_party/gpus/cuda:cuda_nvcc.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvjitlink", + dist_dict = cuda_distributives, + dist_name = "libnvjitlink", + build_template = "//third_party/gpus/cuda:cuda_nvjitlink.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvml", + dist_dict = cuda_distributives, + dist_name = "cuda_nvml_dev", + build_file = "//third_party/gpus/cuda:cuda_nvml.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvprune", + dist_dict = cuda_distributives, + dist_name = "cuda_nvprune", + build_file = "//third_party/gpus/cuda:cuda_nvprune.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvtx", + dist_dict = cuda_distributives, + dist_name = "cuda_nvtx", + build_file = "//third_party/gpus/cuda:cuda_nvtx.BUILD", + ) diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc index 02dec0349c4741..c17ae4494dc99c 100644 --- a/third_party/xla/third_party/tsl/.bazelrc +++ b/third_party/xla/third_party/tsl/.bazelrc @@ -226,12 +226,13 @@ build:cuda --repo_env TF_NEED_CUDA=1 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --@local_config_cuda//:enable_cuda +build:no_cuda_libs --@local_config_cuda//cuda:include_hermetic_cuda_libs=false + # CUDA: This config refers to building CUDA op kernels with clang. build:cuda_clang --config=cuda -# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt -build:cuda_clang --config=tensorrt build:cuda_clang --action_env=TF_CUDA_CLANG="1" build:cuda_clang --@local_config_cuda//:cuda_compiler=clang +build:cuda_clang --copt=-Qunused-arguments # Select supported compute capabilities (supported graphics cards). # This is the same as the official TensorFlow builds. 
# See https://developer.nvidia.com/cuda-gpus#compute @@ -244,12 +245,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. build:cuda_clang_official --config=cuda_clang -build:cuda_clang_official --action_env=TF_CUDA_VERSION="12" -build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8" -build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3" +build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3" +build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8.9" build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang" -build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" # Build with nvcc for CUDA and clang for host @@ -545,10 +544,6 @@ build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda" -build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt" -build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl" -test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --config=nvcc_clang @@ -633,7 +628,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla # Test-related settings below this point. test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true test:release_linux_base --local_test_jobs=HOST_CPUS -test:release_linux_base --test_env=LD_LIBRARY_PATH # Give only the list of failed tests at the end of the log test:release_linux_base --test_summary=short @@ -645,7 +639,6 @@ build:release_gpu_linux --config=release_cpu_linux # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. # Note that linux cpu and cuda builds share the same toolchain now. 
build:release_gpu_linux --config=cuda_clang_official -test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" # Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute @@ -674,11 +667,8 @@ test:unsupported_cpu_linux --config=release_base build:unsupported_gpu_linux --config=cuda build:unsupported_gpu_linux --config=unsupported_cpu_linux build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11" -build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8" +build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8.6" build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" -build:unsupported_gpu_linux --config=tensorrt -build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2" -build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib" build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain diff --git a/third_party/xla/third_party/tsl/opensource_only.files b/third_party/xla/third_party/tsl/opensource_only.files index 3f2bcf8431edc0..a789348e6f50c7 100644 --- a/third_party/xla/third_party/tsl/opensource_only.files +++ b/third_party/xla/third_party/tsl/opensource_only.files @@ -6,6 +6,8 @@ third_party/clang_toolchain/cc_configure_clang.bzl: third_party/clang_toolchain/download_clang.bzl: third_party/compute_library/BUILD: third_party/compute_library/build_defs.bzl: +third_party/cuda_redist_json_repo.bzl: +third_party/cuda_repo.bzl: third_party/curl.BUILD: third_party/cython.BUILD: third_party/ducc/BUILD: @@ -21,6 +23,7 @@ third_party/git/BUILD.tpl: third_party/git/BUILD: third_party/git/git_configure.bzl: third_party/gpus/BUILD: +third_party/gpus/compiler_common_tools.bzl: third_party/gpus/crosstool/BUILD.rocm.tpl: third_party/gpus/crosstool/BUILD.sycl.tpl: third_party/gpus/crosstool/BUILD.tpl: @@ -31,15 +34,32 @@ third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl: third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_sycl.tpl: third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl: third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl: +third_party/gpus/cuda/BUILD.hermetic.tpl: third_party/gpus/cuda/BUILD.tpl: third_party/gpus/cuda/BUILD.windows.tpl: third_party/gpus/cuda/BUILD: third_party/gpus/cuda/LICENSE: third_party/gpus/cuda/build_defs.bzl.tpl: +third_party/gpus/cuda/cuda_cccl.BUILD: third_party/gpus/cuda/cuda_config.h.tpl: third_party/gpus/cuda/cuda_config.py.tpl: +third_party/gpus/cuda/cuda_cublas.BUILD.tpl: +third_party/gpus/cuda/cuda_cudart.BUILD.tpl: +third_party/gpus/cuda/cuda_cudnn.BUILD.tpl: +third_party/gpus/cuda/cuda_cufft.BUILD.tpl: +third_party/gpus/cuda/cuda_cupti.BUILD.tpl: +third_party/gpus/cuda/cuda_curand.BUILD.tpl: +third_party/gpus/cuda/cuda_cusolver.BUILD.tpl: +third_party/gpus/cuda/cuda_cusparse.BUILD.tpl: +third_party/gpus/cuda/cuda_nccl.BUILD: +third_party/gpus/cuda/cuda_nvcc.BUILD: +third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl: +third_party/gpus/cuda/cuda_nvml.BUILD: +third_party/gpus/cuda/cuda_nvprune.BUILD: 
+third_party/gpus/cuda/cuda_nvtx.BUILD: third_party/gpus/cuda_configure.bzl: third_party/gpus/find_cuda_config:.py +third_party/gpus/hermetic_cuda_configure.bzl: third_party/gpus/rocm/BUILD.tpl: third_party/gpus/rocm/BUILD: third_party/gpus/rocm/build_defs.bzl.tpl: @@ -68,6 +88,7 @@ third_party/nccl/archive.BUILD: third_party/nccl/archive.patch: third_party/nccl/build_defs.bzl.tpl: third_party/nccl/generated_names.bzl.tpl: +third_party/nccl/hermetic_nccl_configure.bzl: third_party/nccl/nccl_configure.bzl: third_party/nccl/system.BUILD.tpl: third_party/nvtx/BUILD: diff --git a/third_party/xla/third_party/tsl/third_party/cuda_redist_json_repo.bzl b/third_party/xla/third_party/tsl/third_party/cuda_redist_json_repo.bzl new file mode 100644 index 00000000000000..76941dd74c9488 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/cuda_redist_json_repo.bzl @@ -0,0 +1,110 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for defining CUDA and cuDNN JSON files with distributives versions.""" + +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_DEFAULT_CUDNN_VERSION = "8.9.7.29" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _cuda_redist_json_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + supported_cuda_versions = repository_ctx.attr.cuda_json_dict.keys() + if cuda_version and (cuda_version not in supported_cuda_versions): + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + else: + fail( + ("The supported CUDA versions are {supported_versions}." + + " Please provide a supported version in TF_CUDA_VERSION" + + " environment variable or add JSON URL for" + + " CUDA version={version}.") + .format( + supported_versions = supported_cuda_versions, + version = cuda_version, + ), + ) + supported_cudnn_versions = repository_ctx.attr.cudnn_json_dict.keys() + if cudnn_version and (cudnn_version not in supported_cudnn_versions): + if cudnn_version in ["8", "8.9"]: + cudnn_version = _DEFAULT_CUDNN_VERSION + else: + fail( + ("The supported CUDNN versions are {supported_versions}." 
+ + " Please provide a supported version in TF_CUDNN_VERSION" + + " environment variable or add JSON URL for" + + " CUDNN version={version}.") + .format( + supported_versions = supported_cudnn_versions, + version = cudnn_version, + ), + ) + cuda_distributives = "{}" + cudnn_distributives = "{}" + if cuda_version: + (url, sha256) = repository_ctx.attr.cuda_json_dict[cuda_version] + json_file_name = "redistrib_cuda_%s.json" % cuda_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cuda_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + if cudnn_version: + (url, sha256) = repository_ctx.attr.cudnn_json_dict[cudnn_version] + json_file_name = "redistrib_cudnn_%s.json" % cudnn_version + repository_ctx.download( + url = tf_mirror_urls(url), + sha256 = sha256, + output = json_file_name, + ) + cudnn_distributives = repository_ctx.read(repository_ctx.path(json_file_name)) + + repository_ctx.file( + "build_defs.bzl", + """def get_cuda_distributives(): + return {cuda_distributives} + +def get_cudnn_distributives(): + return {cudnn_distributives} +""".format(cuda_distributives = cuda_distributives, cudnn_distributives = cudnn_distributives), + ) + repository_ctx.file( + "BUILD", + "", + ) + +_cuda_redist_json = repository_rule( + implementation = _cuda_redist_json_impl, + attrs = { + "cuda_json_dict": attr.string_list_dict(mandatory = True), + "cudnn_json_dict": attr.string_list_dict(mandatory = True), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_redist_json(name, cuda_json_dict, cudnn_json_dict): + _cuda_redist_json( + name = name, + cuda_json_dict = cuda_json_dict, + cudnn_json_dict = cudnn_json_dict, + ) diff --git a/third_party/xla/third_party/tsl/third_party/cuda_repo.bzl b/third_party/xla/third_party/tsl/third_party/cuda_repo.bzl new file mode 100644 index 00000000000000..0dcda26cba3a23 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/cuda_repo.bzl @@ -0,0 +1,327 @@ +# Copyright 2024 The TensorFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
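Before the `cuda_repo.bzl` hunk below: the `cuda_redist_json` rule above writes the downloaded JSON verbatim into a generated `build_defs.bzl`, which is what `@cuda_redist_json//:build_defs.bzl` exposes at workspace evaluation time. A sketch of that generated file with placeholder values (the real entries come from NVIDIA's `redistrib_*.json`; the dict shapes follow the lookups in `cuda_repo.bzl`):

```python
# Generated build_defs.bzl (abridged; placeholder values).
def get_cuda_distributives():
    return {
        "cuda_cudart": {
            "version": "...",
            "linux-x86_64": {"relative_path": "...", "sha256": "..."},
            "linux-sbsa": {"relative_path": "...", "sha256": "..."},
        },
        # ...one entry per CUDA component ("libcublas", "cuda_nvcc", etc.)...
    }

def get_cudnn_distributives():
    return {
        "cudnn": {
            "version": "...",
            # cuDNN entries may be keyed per CUDA major version.
            "linux-x86_64": {"cuda12": {"relative_path": "...", "sha256": "..."}},
        },
    }
```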
+ +"""Utilities for defining Google ML CUDA dependencies.""" + +load("@cuda_redist_json//:build_defs.bzl", "get_cuda_distributives", "get_cudnn_distributives") +load("//third_party:repo.bzl", "tf_mirror_urls") + +_DEFAULT_CUDA_VERSION = "12.3.2" +_OS_ARCH_DICT = { + "amd64": "x86_64-unknown-linux-gnu", + "aarch64": "aarch64-unknown-linux-gnu", +} +_REDIST_ARCH_DICT = { + "linux-x86_64": "x86_64-unknown-linux-gnu", + "linux-sbsa": "aarch64-unknown-linux-gnu", +} + +_CUDA_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cuda/redist/" +_CUDNN_DIST_PATH_PREFIX = "https://developer.download.nvidia.com/compute/cudnn/redist/" + +def _get_env_var(ctx, name): + if name in ctx.os.environ: + return ctx.os.environ[name] + else: + return None + +def _get_archive_name(url, archive_suffix = ".tar.xz"): + last_slash_index = url.rfind("/") + return url[last_slash_index + 1:-len(archive_suffix)] + +def _cuda_http_archive_impl(repository_ctx): + cuda_or_cudnn_version = None + dist_version = "" + cuda_version = _get_env_var(repository_ctx, "TF_CUDA_VERSION") + cudnn_version = _get_env_var(repository_ctx, "TF_CUDNN_VERSION") + if repository_ctx.attr.is_cudnn_dist: + cuda_or_cudnn_version = cudnn_version + else: + cuda_or_cudnn_version = cuda_version + if cuda_or_cudnn_version: + # Download archive only when GPU config is used. + dist_version = repository_ctx.attr.dist_version + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + if arch not in repository_ctx.attr.relative_url_dict.keys(): + (relative_url, sha256) = repository_ctx.attr.relative_url_dict["cuda{version}_{arch}" \ + .format( + version = cuda_version.split(".")[0], + arch = arch, + )] + else: + (relative_url, sha256) = repository_ctx.attr.relative_url_dict[arch] + url = (repository_ctx.attr.cudnn_dist_path_prefix if repository_ctx.attr.is_cudnn_dist else repository_ctx.attr.cuda_dist_path_prefix) + relative_url + + archive_name = _get_archive_name(url) + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.override_strip_prefix if repository_ctx.attr.override_strip_prefix else archive_name, + ) + if repository_ctx.attr.build_template: + version = dist_version.split(".")[0] if dist_version else "" + repository_ctx.file("version.txt", version) + repository_ctx.template( + "BUILD", + repository_ctx.attr.build_template, + {"%{version}": version}, + ) + else: + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_http_archive = repository_rule( + implementation = _cuda_http_archive_impl, + attrs = { + "dist_version": attr.string(mandatory = True), + "relative_url_dict": attr.string_list_dict(mandatory = True), + "build_template": attr.label(), + "build_file": attr.label(), + "is_cudnn_dist": attr.bool(), + "override_strip_prefix": attr.string(), + "cudnn_dist_path_prefix": attr.string(default = _CUDNN_DIST_PATH_PREFIX), + "cuda_dist_path_prefix": attr.string(default = _CUDA_DIST_PATH_PREFIX), + "extension": attr.string(default = ".tar.xz"), + }, + environ = ["TF_CUDA_VERSION", "TF_CUDNN_VERSION"], +) + +def cuda_http_archive(name, dist_version, relative_url_dict, **kwargs): + _cuda_http_archive( + name = name, + dist_version = dist_version, + relative_url_dict = relative_url_dict, + **kwargs + ) + +def _cuda_wheel_impl(repository_ctx): + cuda_version = _get_env_var(repository_ctx, 
"TF_CUDA_VERSION") + if cuda_version in ["12", "12.3"]: + cuda_version = _DEFAULT_CUDA_VERSION + if cuda_version: + # Download archive only when GPU config is used. + arch = _OS_ARCH_DICT[repository_ctx.os.arch] + dict_key = "{cuda_version}-{arch}".format( + cuda_version = cuda_version, + arch = arch, + ) + supported_versions = repository_ctx.attr.url_dict.keys() + if dict_key not in supported_versions: + fail( + ("The supported NCCL versions are {supported_versions}." + + " Please provide a supported CUDA version in TF_CUDA_VERSION" + + " environment variable or add NCCL distributive for" + + " CUDA version={version}, OS={arch}.") + .format( + supported_versions = supported_versions, + version = cuda_version, + arch = arch, + ), + ) + sha256 = repository_ctx.attr.sha256_dict[dict_key] + url = repository_ctx.attr.url_dict[dict_key] + + archive_name = _get_archive_name(url, archive_suffix = ".whl") + + repository_ctx.download( + url = tf_mirror_urls(url), + output = archive_name + repository_ctx.attr.extension, + sha256 = sha256, + ) + repository_ctx.extract( + archive = archive_name + repository_ctx.attr.extension, + stripPrefix = repository_ctx.attr.strip_prefix, + ) + + repository_ctx.file( + "BUILD", + repository_ctx.read(repository_ctx.attr.build_file), + ) + +_cuda_wheel = repository_rule( + implementation = _cuda_wheel_impl, + attrs = { + "sha256_dict": attr.string_dict(mandatory = True), + "url_dict": attr.string_dict(mandatory = True), + "build_file": attr.label(), + "strip_prefix": attr.string(), + "extension": attr.string(default = ".zip"), + }, + environ = ["TF_CUDA_VERSION"], +) + +def cuda_wheel(name, sha256_dict, url_dict, **kwargs): + _cuda_wheel( + name = name, + sha256_dict = sha256_dict, + url_dict = url_dict, + **kwargs + ) + +def _get_relative_url_dict(dist_info): + relative_url_dict = {} + for arch in _REDIST_ARCH_DICT.keys(): + # CUDNN JSON might contain paths for each CUDA version. 
+ if "relative_path" not in dist_info[arch]: + for cuda_version, data in dist_info[arch].items(): + relative_url_dict["{cuda_version}_{arch}" \ + .format( + cuda_version = cuda_version, + arch = _REDIST_ARCH_DICT[arch], + )] = [data["relative_path"], data["sha256"]] + else: + relative_url_dict[_REDIST_ARCH_DICT[arch]] = [ + dist_info[arch]["relative_path"], + dist_info[arch]["sha256"], + ] + return relative_url_dict + +def _get_cuda_archive( + repo_name, + dist_dict, + dist_name, + build_file = None, + build_template = None, + is_cudnn_dist = False): + if dist_name in dist_dict.keys(): + return cuda_http_archive( + name = repo_name, + dist_version = dist_dict[dist_name]["version"], + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = _get_relative_url_dict(dist_dict[dist_name]), + is_cudnn_dist = is_cudnn_dist, + ) + else: + return cuda_http_archive( + name = repo_name, + dist_version = "", + build_file = Label(build_file) if build_file else None, + build_template = Label(build_template) if build_template else None, + relative_url_dict = {"": []}, + is_cudnn_dist = is_cudnn_dist, + ) + +def cuda_distributives(cuda_nccl_wheel_dict): + nccl_artifacts_dict = {"sha256_dict": {}, "url_dict": {}} + for cuda_version, nccl_wheel_info in cuda_nccl_wheel_dict.items(): + for arch in _OS_ARCH_DICT.values(): + if arch in nccl_wheel_info.keys(): + cuda_version_to_arch_key = "%s-%s" % (cuda_version, arch) + nccl_artifacts_dict["sha256_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["sha256"] + nccl_artifacts_dict["url_dict"][cuda_version_to_arch_key] = nccl_wheel_info[arch]["url"] + + cuda_wheel( + name = "cuda_nccl", + sha256_dict = nccl_artifacts_dict["sha256_dict"], + url_dict = nccl_artifacts_dict["url_dict"], + build_file = Label("//third_party/gpus/cuda:cuda_nccl.BUILD"), + strip_prefix = "nvidia/nccl", + ) + + cuda_distributives = get_cuda_distributives() + cudnn_distributives = get_cudnn_distributives() + + _get_cuda_archive( + repo_name = "cuda_cccl", + dist_dict = cuda_distributives, + dist_name = "cuda_cccl", + build_file = "//third_party/gpus/cuda:cuda_cccl.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_cublas", + dist_dict = cuda_distributives, + dist_name = "libcublas", + build_template = "//third_party/gpus/cuda:cuda_cublas.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudart", + dist_dict = cuda_distributives, + dist_name = "cuda_cudart", + build_template = "//third_party/gpus/cuda:cuda_cudart.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cudnn", + dist_dict = cudnn_distributives, + dist_name = "cudnn", + build_template = "//third_party/gpus/cuda:cuda_cudnn.BUILD.tpl", + is_cudnn_dist = True, + ) + _get_cuda_archive( + repo_name = "cuda_cufft", + dist_dict = cuda_distributives, + dist_name = "libcufft", + build_template = "//third_party/gpus/cuda:cuda_cufft.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cupti", + dist_dict = cuda_distributives, + dist_name = "cuda_cupti", + build_template = "//third_party/gpus/cuda:cuda_cupti.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_curand", + dist_dict = cuda_distributives, + dist_name = "libcurand", + build_template = "//third_party/gpus/cuda:cuda_curand.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_cusolver", + dist_dict = cuda_distributives, + dist_name = "libcusolver", + build_template = "//third_party/gpus/cuda:cuda_cusolver.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = 
"cuda_cusparse", + dist_dict = cuda_distributives, + dist_name = "libcusparse", + build_template = "//third_party/gpus/cuda:cuda_cusparse.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvcc", + dist_dict = cuda_distributives, + dist_name = "cuda_nvcc", + build_file = "//third_party/gpus/cuda:cuda_nvcc.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvjitlink", + dist_dict = cuda_distributives, + dist_name = "libnvjitlink", + build_template = "//third_party/gpus/cuda:cuda_nvjitlink.BUILD.tpl", + ) + _get_cuda_archive( + repo_name = "cuda_nvml", + dist_dict = cuda_distributives, + dist_name = "cuda_nvml_dev", + build_file = "//third_party/gpus/cuda:cuda_nvml.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvprune", + dist_dict = cuda_distributives, + dist_name = "cuda_nvprune", + build_file = "//third_party/gpus/cuda:cuda_nvprune.BUILD", + ) + _get_cuda_archive( + repo_name = "cuda_nvtx", + dist_dict = cuda_distributives, + dist_name = "cuda_nvtx", + build_file = "//third_party/gpus/cuda:cuda_nvtx.BUILD", + ) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/compiler_common_tools.bzl b/third_party/xla/third_party/tsl/third_party/gpus/compiler_common_tools.bzl new file mode 100644 index 00000000000000..bd07f49ec457bb --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/compiler_common_tools.bzl @@ -0,0 +1,174 @@ +"""Common compiler functions. """ + +load( + "//third_party/remote_config:common.bzl", + "err_out", + "raw_exec", + "realpath", +) + +def to_list_of_strings(elements): + """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. + + This is to be used to put a list of strings into the bzl file templates + so it gets interpreted as list of strings in Starlark. + + Args: + elements: list of string elements + + Returns: + single string of elements wrapped in quotes separated by a comma.""" + quoted_strings = ["\"" + element + "\"" for element in elements] + return ", ".join(quoted_strings) + +_INC_DIR_MARKER_BEGIN = "#include <...>" + +# OSX add " (framework directory)" at the end of line, strip it. +_OSX_FRAMEWORK_SUFFIX = " (framework directory)" +_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) + +# TODO(dzc): Once these functions have been factored out of Bazel's +# cc_configure.bzl, load them from @bazel_tools instead. +def _cxx_inc_convert(path): + """Convert path returned by cc -E xc++ in a complete path.""" + path = path.strip() + if path.endswith(_OSX_FRAMEWORK_SUFFIX): + path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() + return path + +def _normalize_include_path(repository_ctx, path): + """Normalizes include paths before writing them to the crosstool. + + If path points inside the 'crosstool' folder of the repository, a relative + path is returned. + If path points outside the 'crosstool' folder, an absolute path is returned. + """ + path = str(repository_ctx.path(path)) + crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) + + if path.startswith(crosstool_folder): + # We drop the path to "$REPO/crosstool" and a trailing path separator. + return path[len(crosstool_folder) + 1:] + return path + +def _is_compiler_option_supported(repository_ctx, cc, option): + """Checks that `option` is supported by the C compiler. 
Doesn't %-escape the option.""" + result = repository_ctx.execute([ + cc, + option, + "-o", + "/dev/null", + "-c", + str(repository_ctx.path("tools/cpp/empty.cc")), + ]) + return result.stderr.find(option) == -1 + +def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sys_root): + """Compute the list of default C or C++ include directories.""" + if lang_is_cpp: + lang = "c++" + else: + lang = "c" + sysroot = [] + if tf_sys_root: + sysroot += ["--sysroot", tf_sys_root] + result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + + sysroot) + stderr = err_out(result) + index1 = stderr.find(_INC_DIR_MARKER_BEGIN) + if index1 == -1: + return [] + index1 = stderr.find("\n", index1) + if index1 == -1: + return [] + index2 = stderr.rfind("\n ") + if index2 == -1 or index2 < index1: + return [] + index2 = stderr.find("\n", index2 + 1) + if index2 == -1: + inc_dirs = stderr[index1 + 1:] + else: + inc_dirs = stderr[index1 + 1:index2].strip() + + print_resource_dir_supported = _is_compiler_option_supported( + repository_ctx, + cc, + "-print-resource-dir", + ) + + if print_resource_dir_supported: + resource_dir = repository_ctx.execute( + [cc, "-print-resource-dir"], + ).stdout.strip() + "/share" + inc_dirs += "\n" + resource_dir + + compiler_includes = [ + _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) + for p in inc_dirs.split("\n") + ] + + # The compiler might be on a symlink, e.g. /symlink -> /opt/gcc + # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) + # but Bazel might encounter either (usually reported by the compiler) + # especially when a compiler wrapper (e.g. ccache) is used. + # So we need to also include paths where symlinks are not resolved. + + # Try to find real path to CC installation to "see through" compiler wrappers + # GCC has the path to g++ + index1 = result.stderr.find("COLLECT_GCC=") + if index1 != -1: + index1 = result.stderr.find("=", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname + else: + # Clang has the directory + index1 = result.stderr.find("InstalledDir: ") + if index1 != -1: + index1 = result.stderr.find(" ", index1) + index2 = result.stderr.find("\n", index1) + cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname + else: + # Fallback to the CC path + cc_topdir = repository_ctx.path(cc).dirname.dirname + + # We now have the compiler installation prefix, e.g. /symlink/gcc + # And the resolved installation prefix, e.g. /opt/gcc + cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() + cc_topdir = str(cc_topdir).strip() + + # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. + # e.g. [/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] + # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] + if cc_topdir_resolved != cc_topdir: + unresolved_compiler_includes = [ + cc_topdir + inc[len(cc_topdir_resolved):] + for inc in compiler_includes + if inc.startswith(cc_topdir_resolved) + ] + compiler_includes = compiler_includes + unresolved_compiler_includes + return compiler_includes + +def get_cxx_inc_directories(repository_ctx, cc, tf_sys_root): + """Compute the list of default C and C++ include directories.""" + + # For some reason `clang -xc` sometimes returns include paths that are + # different from the ones from `clang -xc++`. 
(Symlink and a dir) + # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists + includes_cpp = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + True, + tf_sys_root, + ) + includes_c = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + False, + tf_sys_root, + ) + + return includes_cpp + [ + inc + for inc in includes_c + if inc not in includes_cpp + ] diff --git a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl index 8eda7a1cf6ac2b..b9553d9b99ecfe 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl +++ b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/BUILD.tpl @@ -2,6 +2,7 @@ # Update cuda_configure.bzl#verify_build_defines when adding new variables. load(":cc_toolchain_config.bzl", "cc_toolchain_config") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") licenses(["restricted"]) @@ -133,9 +134,17 @@ filegroup( srcs = [], ) +filegroup( + name = "cuda_nvcc_files", + srcs = %{cuda_nvcc_files}, +) + filegroup( name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], + srcs = [ + ":cuda_nvcc_files", + ":clang/bin/crosstool_wrapper_driver_is_not_gcc" + ], ) filegroup( diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.hermetic.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.hermetic.tpl new file mode 100644 index 00000000000000..1c00f1c5e32916 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.hermetic.tpl @@ -0,0 +1,291 @@ +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +# Config setting whether TensorFlow is built with CUDA support using clang. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_clang. +selects.config_setting_group( + name = "using_clang", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_clang", + ], +) + +# Config setting whether TensorFlow is built with CUDA support using nvcc. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. +selects.config_setting_group( + name = "using_nvcc", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_nvcc", + ], +) + +# Equivalent to using_clang && -c opt. +selects.config_setting_group( + name = "using_clang_opt", + match_all = [ + ":using_clang", + ":_opt", + ], +) + +config_setting( + name = "_opt", + values = {"compilation_mode": "opt"}, +) + +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. 
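+# In this hermetic template the per-library pieces below are aliases into the
+# individual @cuda_* repositories instead of copies of a local installation.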
+cc_library( + name = "cuda_headers", + hdrs = [ + "cuda/cuda_config.h", + ], + include_prefix = "third_party/gpus", + includes = [ + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h + ], + deps = [":cudart_headers", + ":cublas_headers", + ":cccl_headers", + ":nvtx_headers", + ":nvcc_headers", + ":nvjitlink_headers", + ":cusolver_headers", + ":cufft_headers", + ":cusparse_headers", + ":curand_headers", + ":cupti_headers", + ":nvml_headers"], +) + +cc_library( + name = "cudart_static", + srcs = ["@cuda_cudart//:static"], + linkopts = [ + "-ldl", + "-lpthread", + %{cudart_static_linkopt} + ], +) + +alias( + name = "cuda_driver", + actual = "@cuda_cudart//:cuda_driver", +) + +alias( + name = "cudart_headers", + actual = "@cuda_cudart//:headers", +) + +alias( + name = "cudart", + actual = "@cuda_cudart//:cudart", +) + +alias( + name = "nvjitlink_headers", + actual = "@cuda_nvjitlink//:headers", +) + +alias( + name = "nvjitlink", + actual = "@cuda_nvjitlink//:nvjitlink", +) + +alias( + name = "nvtx_headers", + actual = "@cuda_nvtx//:headers", +) + +alias( + name = "nvml_headers", + actual = "@cuda_nvml//:headers", +) + +alias( + name = "nvcc_headers", + actual = "@cuda_nvcc//:headers", +) + +alias( + name = "cccl_headers", + actual = "@cuda_cccl//:headers", +) + +alias( + name = "cublas_headers", + actual = "@cuda_cublas//:headers", +) + +alias( + name = "cusolver_headers", + actual = "@cuda_cusolver//:headers", +) + +alias( + name = "cufft_headers", + actual = "@cuda_cufft//:headers", +) + +alias( + name = "cusparse_headers", + actual = "@cuda_cusparse//:headers", +) + +alias( + name = "curand_headers", + actual = "@cuda_curand//:headers", +) + +alias( + name = "cublas", + actual = "@cuda_cublas//:cublas", +) + +alias( + name = "cublasLt", + actual = "@cuda_cublas//:cublasLt", +) + +alias( + name = "cusolver", + actual = "@cuda_cusolver//:cusolver", +) + +alias( + name = "cudnn", + actual = "@cuda_cudnn//:cudnn", +) + +alias( + name = "cudnn_ops_infer", + actual = "@cuda_cudnn//:cudnn_ops_infer", +) + +alias( + name = "cudnn_cnn_infer", + actual = "@cuda_cudnn//:cudnn_cnn_infer", +) + +alias( + name = "cudnn_ops_train", + actual = "@cuda_cudnn//:cudnn_ops_train", +) + +alias( + name = "cudnn_cnn_train", + actual = "@cuda_cudnn//:cudnn_cnn_train", +) + +alias( + name = "cudnn_adv_infer", + actual = "@cuda_cudnn//:cudnn_adv_infer", +) + +alias( + name = "cudnn_adv_train", + actual = "@cuda_cudnn//:cudnn_adv_train", +) +alias( + name = "cudnn_header", + actual = "@cuda_cudnn//:headers", +) + +alias( + name = "cufft", + actual = "@cuda_cufft//:cufft", +) + +alias( + name = "curand", + actual = "@cuda_curand//:curand", +) + +cc_library( + name = "cuda", + deps = [ + ":cublas", + ":cublasLt", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +alias( + name = "cub_headers", + actual = "%{cub_actual}", +) + +alias( + name = "cupti_headers", + actual = "@cuda_cupti//:headers", +) + +alias( + name = "cupti_dsos", + actual = "@cuda_cupti//:cupti", +) + +alias( + name = "cusparse", + actual = "@cuda_cusparse//:cusparse", +) + +cc_library( + name = "libdevice_root", + data = ["@cuda_nvcc//:nvvm"], +) + +bzl_library( + name = "build_defs_bzl", + srcs = ["build_defs.bzl"], + deps = [ + "@bazel_skylib//lib:selects", + ], +) + +py_library( + name = "cuda_config_py", + srcs = ["cuda/cuda_config.py"], +) + +# Config setting whether TensorFlow is built with hermetic CUDA. 
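+# The alias below reuses :is_cuda_enabled, so in the hermetic template the
+# CUDA tools are considered available whenever CUDA itself is enabled.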
+alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = True, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl index 90a18b90de048c..a4264cc14890e5 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl @@ -1,6 +1,7 @@ load(":build_defs.bzl", "cuda_header_library") load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like @@ -144,7 +145,6 @@ cc_library( name = "cusolver", srcs = ["cuda/lib/%{cusolver_lib}"], data = ["cuda/lib/%{cusolver_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -220,7 +220,6 @@ cc_library( name = "cusparse", srcs = ["cuda/lib/%{cusparse_lib}"], data = ["cuda/lib/%{cusparse_lib}"], - linkopts = ["-lgomp"], linkstatic = 1, ) @@ -242,4 +241,29 @@ py_library( srcs = ["cuda/cuda_config.py"], ) +# Config setting whether TensorFlow is built with hermetic CUDA. +alias( + name = "hermetic_cuda_tools", + actual = "@local_config_cuda//:is_cuda_enabled", +) + +# Flag indicating if we should include hermetic CUDA libs. +bool_flag( + name = "include_hermetic_cuda_libs", + build_setting_default = False, +) + +config_setting( + name = "hermetic_cuda_libs", + flag_values = {":include_hermetic_cuda_libs": "True"}, +) + +selects.config_setting_group( + name = "hermetic_cuda_tools_and_libs", + match_all = [ + ":hermetic_cuda_libs", + ":hermetic_cuda_tools" + ], +) + %{copy_rules} diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cccl.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cccl.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cccl.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cublas.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cublas.BUILD.tpl new file mode 100644 index 00000000000000..d5766c971a50ff --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cublas.BUILD.tpl @@ -0,0 +1,33 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cublas", + hdrs = [":headers"], + shared_library = "lib/libcublas.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cublasLt", + hdrs = [":headers"], + shared_library = "lib/libcublasLt.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cublas.h", + 
"include/cublas_v2.h", + "include/cublas_api.h", + "include/cublasLt.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudart.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudart.BUILD.tpl new file mode 100644 index 00000000000000..08655e7819156c --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudart.BUILD.tpl @@ -0,0 +1,34 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +filegroup( + name = "static", + srcs = ["lib/libcudart_static.a"], + visibility = ["@local_config_cuda//cuda:__pkg__"], +) + +cc_import( + name = "cuda_driver", + shared_library = "lib/stubs/libcuda.so", +) + +cc_import( + name = "cudart", + hdrs = [":headers"], + shared_library = "lib/libcudart.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl new file mode 100644 index 00000000000000..98da6e69cbe644 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl @@ -0,0 +1,65 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cudnn", + hdrs = [":headers"], + shared_library = "lib/libcudnn.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_ops_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_ops_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_cnn_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_cnn_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_infer", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_infer.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn_adv_train", + hdrs = [":headers"], + shared_library = "lib/libcudnn_adv_train.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cudnn", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cufft.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cufft.BUILD.tpl new file mode 100644 index 00000000000000..6836814dc9b622 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cufft.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = 
"cufft", + hdrs = [":headers"], + shared_library = "lib/libcufft.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cupti.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cupti.BUILD.tpl new file mode 100644 index 00000000000000..772386d723649f --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cupti.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cupti", + hdrs = [":headers"], + shared_library = "lib/libcupti.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/extras/CUPTI/include", + includes = ["include/"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_curand.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_curand.BUILD.tpl new file mode 100644 index 00000000000000..c98ded26f4b907 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_curand.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "curand", + hdrs = [":headers"], + shared_library = "lib/libcurand.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl new file mode 100644 index 00000000000000..6a5f9d9737cfe2 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl @@ -0,0 +1,25 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusolver", + hdrs = [":headers"], + shared_library = "lib/libcusolver.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = [ + "include/cusolver_common.h", + "include/cusolverDn.h", + "include/cusolverSp.h" + ], + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl new file mode 100644 index 00000000000000..ad5c2b5f0c45c1 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "cusparse", + hdrs = [":headers"], + shared_library = "lib/libcusparse.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs 
= glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nccl.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nccl.BUILD new file mode 100644 index 00000000000000..440b31c5cb616e --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nccl.BUILD @@ -0,0 +1,7 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_import( + name = "nccl", + shared_library = "lib/libnccl.so.2", + visibility = ["//visibility:public"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvcc.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvcc.BUILD new file mode 100644 index 00000000000000..6cdaca5cc902a0 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvcc.BUILD @@ -0,0 +1,73 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "bin/nvcc", +]) + +filegroup( + name = "nvvm", + srcs = [ + "nvvm/libdevice/libdevice.10.bc", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "nvlink", + srcs = [ + "bin/nvlink", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "fatbinary", + srcs = [ + "bin/fatbinary", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin2c", + srcs = [ + "bin/bin2c", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "ptxas", + srcs = [ + "bin/ptxas", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "bin", + srcs = glob([ + "bin/**", + "nvvm/bin/**", + ]), + visibility = ["//visibility:public"], +) + +filegroup( + name = "link_stub", + srcs = [ + "bin/crt/link.stub", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl new file mode 100644 index 00000000000000..6729b7cd1df9c4 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl @@ -0,0 +1,23 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +exports_files([ + "version.txt", +]) + +cc_import( + name = "nvjitlink", + hdrs = [":headers"], + shared_library = "lib/libnvJitLink.so.%{version}", + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvml.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvml.BUILD new file mode 100644 index 00000000000000..40b97e671cf7de --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvml.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/nvml/include", + includes = ["include"], + strip_include_prefix = "include", + 
visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvprune.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvprune.BUILD new file mode 100644 index 00000000000000..986ef0c8f76166 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvprune.BUILD @@ -0,0 +1,9 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +filegroup( + name = "nvprune", + srcs = [ + "bin/nvprune", + ], + visibility = ["//visibility:public"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvtx.BUILD b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvtx.BUILD new file mode 100644 index 00000000000000..9823f1d871ed53 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_nvtx.BUILD @@ -0,0 +1,12 @@ +licenses(["restricted"]) # NVIDIA proprietary license + +cc_library( + name = "headers", + hdrs = glob([ + "include/**", + ]), + include_prefix = "third_party/gpus/cuda/include", + includes = ["include"], + strip_include_prefix = "include", + visibility = ["@local_config_cuda//cuda:__pkg__"], +) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl index fefbf081c87e1c..b8aad7ed4994ee 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl @@ -53,6 +53,11 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "get_cxx_inc_directories", + "to_list_of_strings", +) _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX" @@ -67,20 +72,6 @@ _TF_CUDA_CONFIG_REPO = "TF_CUDA_CONFIG_REPO" _TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" -def to_list_of_strings(elements): - """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. - - This is to be used to put a list of strings into the bzl file templates - so it gets interpreted as list of strings in Starlark. - - Args: - elements: list of string elements - - Returns: - single string of elements wrapped in quotes separated by a comma.""" - quoted_strings = ["\"" + element + "\"" for element in elements] - return ", ".join(quoted_strings) - def verify_build_defines(params): """Verify all variables that crosstool/BUILD.tpl expects are substituted. @@ -238,156 +229,6 @@ def find_cc(repository_ctx, use_cuda_clang): " environment variable").format(target_cc_name, cc_path_envvar)) return cc -_INC_DIR_MARKER_BEGIN = "#include <...>" - -# OSX add " (framework directory)" at the end of line, strip it. -_OSX_FRAMEWORK_SUFFIX = " (framework directory)" -_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) - -def _cxx_inc_convert(path): - """Convert path returned by cc -E xc++ in a complete path.""" - path = path.strip() - if path.endswith(_OSX_FRAMEWORK_SUFFIX): - path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() - return path - -def _normalize_include_path(repository_ctx, path): - """Normalizes include paths before writing them to the crosstool. - - If path points inside the 'crosstool' folder of the repository, a relative - path is returned. - If path points outside the 'crosstool' folder, an absolute path is returned. 
- """ - path = str(repository_ctx.path(path)) - crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) - - if path.startswith(crosstool_folder): - # We drop the path to "$REPO/crosstool" and a trailing path separator. - return path[len(crosstool_folder) + 1:] - return path - -def _is_compiler_option_supported(repository_ctx, cc, option): - """Checks that `option` is supported by the C compiler. Doesn't %-escape the option.""" - result = repository_ctx.execute([ - cc, - option, - "-o", - "/dev/null", - "-c", - str(repository_ctx.path("tools/cpp/empty.cc")), - ]) - return result.stderr.find(option) == -1 - -def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sysroot): - """Compute the list of default C or C++ include directories.""" - if lang_is_cpp: - lang = "c++" - else: - lang = "c" - sysroot = [] - if tf_sysroot: - sysroot += ["--sysroot", tf_sysroot] - result = raw_exec(repository_ctx, [cc, "-E", "-x" + lang, "-", "-v"] + - sysroot) - stderr = err_out(result) - index1 = stderr.find(_INC_DIR_MARKER_BEGIN) - if index1 == -1: - return [] - index1 = stderr.find("\n", index1) - if index1 == -1: - return [] - index2 = stderr.rfind("\n ") - if index2 == -1 or index2 < index1: - return [] - index2 = stderr.find("\n", index2 + 1) - if index2 == -1: - inc_dirs = stderr[index1 + 1:] - else: - inc_dirs = stderr[index1 + 1:index2].strip() - - print_resource_dir_supported = _is_compiler_option_supported( - repository_ctx, - cc, - "-print-resource-dir", - ) - - if print_resource_dir_supported: - resource_dir = repository_ctx.execute( - [cc, "-print-resource-dir"], - ).stdout.strip() + "/share" - inc_dirs += "\n" + resource_dir - - compiler_includes = [ - _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) - for p in inc_dirs.split("\n") - ] - - # The compiler might be on a symlink, e.g. /symlink -> /opt/gcc - # The above keeps only the resolved paths to the default includes (e.g. /opt/gcc/include/c++/11) - # but Bazel might encounter either (usually reported by the compiler) - # especially when a compiler wrapper (e.g. ccache) is used. - # So we need to also include paths where symlinks are not resolved. - - # Try to find real path to CC installation to "see through" compiler wrappers - # GCC has the path to g++ - index1 = result.stderr.find("COLLECT_GCC=") - if index1 != -1: - index1 = result.stderr.find("=", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname.dirname - else: - # Clang has the directory - index1 = result.stderr.find("InstalledDir: ") - if index1 != -1: - index1 = result.stderr.find(" ", index1) - index2 = result.stderr.find("\n", index1) - cc_topdir = repository_ctx.path(result.stderr[index1 + 1:index2]).dirname - else: - # Fallback to the CC path - cc_topdir = repository_ctx.path(cc).dirname.dirname - - # We now have the compiler installation prefix, e.g. /symlink/gcc - # And the resolved installation prefix, e.g. /opt/gcc - cc_topdir_resolved = str(realpath(repository_ctx, cc_topdir)).strip() - cc_topdir = str(cc_topdir).strip() - - # If there is (any!) symlink involved we add paths where the unresolved installation prefix is kept. - # e.g. 
[/opt/gcc/include/c++/11, /opt/gcc/lib/gcc/x86_64-linux-gnu/11/include, /other/path] - # adds [/symlink/include/c++/11, /symlink/lib/gcc/x86_64-linux-gnu/11/include] - if cc_topdir_resolved != cc_topdir: - unresolved_compiler_includes = [ - cc_topdir + inc[len(cc_topdir_resolved):] - for inc in compiler_includes - if inc.startswith(cc_topdir_resolved) - ] - compiler_includes = compiler_includes + unresolved_compiler_includes - return compiler_includes - -def get_cxx_inc_directories(repository_ctx, cc, tf_sysroot): - """Compute the list of default C and C++ include directories.""" - - # For some reason `clang -xc` sometimes returns include paths that are - # different from the ones from `clang -xc++`. (Symlink and a dir) - # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists - includes_cpp = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - True, - tf_sysroot, - ) - includes_c = _get_cxx_inc_directories_impl( - repository_ctx, - cc, - False, - tf_sysroot, - ) - - return includes_cpp + [ - inc - for inc in includes_c - if inc not in includes_cpp - ] - def auto_configure_fail(msg): """Output failure message when cuda configuration fails.""" red = "\033[0;31m" @@ -1293,6 +1134,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cuda_nvcc_files}"] = "[]" if is_cuda_clang and not is_nvcc_and_clang: cuda_defines["%{host_compiler_path}"] = str(cc) cuda_defines["%{host_compiler_warnings}"] = """ diff --git a/third_party/xla/third_party/tsl/third_party/gpus/hermetic_cuda_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/hermetic_cuda_configure.bzl new file mode 100644 index 00000000000000..5d16aa6f76a1f4 --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/hermetic_cuda_configure.bzl @@ -0,0 +1,570 @@ +"""Repository rule for hermetic CUDA autoconfiguration. + +`hermetic_cuda_configure` depends on the following environment variables: + + * `TF_NEED_CUDA`: Whether to enable building with CUDA. + * `TF_NVCC_CLANG`: Whether to use clang for C++ and NVCC for Cuda compilation. + * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for + both host and device code compilation. + * `TF_SYSROOT`: The sysroot to use when compiling. + * `TF_CUDA_VERSION`: The version of the CUDA toolkit (mandatory). + * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is + `3.5,5.2`. + * `PYTHON_BIN_PATH`: The python binary path +""" + +load( + "//third_party/remote_config:common.bzl", + "get_cpu_value", + "get_host_environ", + "which", +) +load( + ":compiler_common_tools.bzl", + "get_cxx_inc_directories", + "to_list_of_strings", +) + +def _find_cc(repository_ctx): + """Find the C++ compiler.""" + cc_path_envvar = _CLANG_CUDA_COMPILER_PATH + cc_name = "clang" + + cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar) + if cc_name_from_env: + cc_name = cc_name_from_env + if cc_name.startswith("/"): + # Return the absolute path. 
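+            # e.g. CLANG_CUDA_COMPILER_PATH=/usr/lib/llvm-17/bin/clang (an
+            # illustrative path) is used as-is instead of being resolved
+            # with which().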
+ return cc_name + cc = which(repository_ctx, cc_name) + if cc == None: + fail(("Cannot find {}, either correct your path or set the {}" + + " environment variable").format(cc_name, cc_path_envvar)) + return cc + +def _auto_configure_fail(msg): + """Output failure message when cuda configuration fails.""" + red = "\033[0;31m" + no_color = "\033[0m" + fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) + +def _lib_name(base_name, cpu_value, version = None, static = False): + """Constructs the platform-specific name of a library. + + Args: + base_name: The name of the library, such as "cudart" + cpu_value: The name of the host operating system. + version: The version of the library. + static: True the library is static or False if it is a shared object. + + Returns: + The platform-specific name of the library. + """ + version = "" if not version else "." + version + if cpu_value in ("Linux"): + if static: + return "lib%s.a" % base_name + return "lib%s.so%s" % (base_name, version) + elif cpu_value == "Windows": + return "%s.lib" % base_name + elif cpu_value == "Darwin": + if static: + return "lib%s.a" % base_name + return "lib%s%s.dylib" % (base_name, version) + else: + _auto_configure_fail("Invalid cpu_value: %s" % cpu_value) + +def _verify_build_defines(params): + """Verify all variables that crosstool/BUILD.tpl expects are substituted. + + Args: + params: dict of variables that will be passed to the BUILD.tpl template. + """ + missing = [] + for param in [ + "cxx_builtin_include_directories", + "extra_no_canonical_prefixes_flags", + "host_compiler_path", + "host_compiler_prefix", + "host_compiler_warnings", + "linker_bin_path", + "compiler_deps", + "msvc_cl_path", + "msvc_env_include", + "msvc_env_lib", + "msvc_env_path", + "msvc_env_tmp", + "msvc_lib_path", + "msvc_link_path", + "msvc_ml_path", + "unfiltered_compile_flags", + "win_compiler_deps", + ]: + if ("%{" + param + "}") not in params: + missing.append(param) + + if missing: + _auto_configure_fail( + "BUILD.tpl template is missing these variables: " + + str(missing) + + ".\nWe only got: " + + str(params) + + ".", + ) + +def get_cuda_version(repository_ctx): + return get_host_environ(repository_ctx, _TF_CUDA_VERSION) + +def enable_cuda(repository_ctx): + """Returns whether to build with CUDA support.""" + return int(get_host_environ(repository_ctx, TF_NEED_CUDA, False)) + +def _flag_enabled(repository_ctx, flag_name): + return get_host_environ(repository_ctx, flag_name) == "1" + +def _use_nvcc_and_clang(repository_ctx): + # Returns the flag if we need to use clang for C++ and NVCC for Cuda. + return _flag_enabled(repository_ctx, _TF_NVCC_CLANG) + +def _tf_sysroot(repository_ctx): + return get_host_environ(repository_ctx, _TF_SYSROOT, "") + +def _py_tmpl_dict(d): + return {"%{cuda_config}": str(d)} + +def _cudart_static_linkopt(cpu_value): + """Returns additional platform-specific linkopts for cudart.""" + return "\"\"," if cpu_value == "Darwin" else "\"-lrt\"," + +def _compute_capabilities(repository_ctx): + """Returns a list of strings representing cuda compute capabilities. + + Args: + repository_ctx: the repo rule's context. + + Returns: + list of cuda architectures to compile for. 'compute_xy' refers to + both PTX and SASS, 'sm_xy' refers to SASS only. + """ + capabilities = get_host_environ( + repository_ctx, + _TF_CUDA_COMPUTE_CAPABILITIES, + "compute_35,compute_52", + ).split(",") + + # Map old 'x.y' capabilities to 'compute_xy'. 
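+    # For example, TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0" yields
+    # ["sm_75", "compute_80"]: SASS for every listed architecture, PTX only
+    # for the highest one.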
+ if len(capabilities) > 0 and all([len(x.split(".")) == 2 for x in capabilities]): + # If all capabilities are in 'x.y' format, only include PTX for the + # highest capability. + cc_list = sorted([x.replace(".", "") for x in capabilities]) + capabilities = ["sm_%s" % x for x in cc_list[:-1]] + ["compute_%s" % cc_list[-1]] + for i, capability in enumerate(capabilities): + parts = capability.split(".") + if len(parts) != 2: + continue + capabilities[i] = "compute_%s%s" % (parts[0], parts[1]) + + # Make list unique + capabilities = dict(zip(capabilities, capabilities)).keys() + + # Validate capabilities. + for capability in capabilities: + if not capability.startswith(("compute_", "sm_")): + _auto_configure_fail("Invalid compute capability: %s" % capability) + for prefix in ["compute_", "sm_"]: + if not capability.startswith(prefix): + continue + if len(capability) == len(prefix) + 2 and capability[-2:].isdigit(): + continue + if len(capability) == len(prefix) + 3 and capability.endswith("90a"): + continue + _auto_configure_fail("Invalid compute capability: %s" % capability) + + return capabilities + +def _compute_cuda_extra_copts(compute_capabilities): + copts = ["--no-cuda-include-ptx=all"] + for capability in compute_capabilities: + if capability.startswith("compute_"): + capability = capability.replace("compute_", "sm_") + copts.append("--cuda-include-ptx=%s" % capability) + copts.append("--cuda-gpu-arch=%s" % capability) + + return str(copts) + +def _get_cuda_config(repository_ctx): + """Detects and returns information about the CUDA installation on the system. + + Args: + repository_ctx: The repository context. + + Returns: + A struct containing the following fields: + cuda_version: The version of CUDA on the system. + cudart_version: The CUDA runtime version on the system. + cudnn_version: The version of cuDNN on the system. + compute_capabilities: A list of the system's CUDA compute capabilities. + cpu_value: The name of the host operating system. + """ + + return struct( + cuda_version = get_cuda_version(repository_ctx), + cupti_version = repository_ctx.read(repository_ctx.attr.cupti_version), + cudart_version = repository_ctx.read(repository_ctx.attr.cudart_version), + cublas_version = repository_ctx.read(repository_ctx.attr.cublas_version), + cusolver_version = repository_ctx.read(repository_ctx.attr.cusolver_version), + curand_version = repository_ctx.read(repository_ctx.attr.curand_version), + cufft_version = repository_ctx.read(repository_ctx.attr.cufft_version), + cusparse_version = repository_ctx.read(repository_ctx.attr.cusparse_version), + cudnn_version = repository_ctx.read(repository_ctx.attr.cudnn_version), + compute_capabilities = _compute_capabilities(repository_ctx), + cpu_value = get_cpu_value(repository_ctx), + ) + +_DUMMY_CROSSTOOL_BZL_FILE = """ +def error_gpu_disabled(): + fail("ERROR: Building with --config=cuda but TensorFlow is not configured " + + "to build with GPU support. Please re-run ./configure and enter 'Y' " + + "at the prompt to build with GPU support.") + + native.genrule( + name = "error_gen_crosstool", + outs = ["CROSSTOOL"], + cmd = "echo 'Should not be run.' && exit 1", + ) + + native.filegroup( + name = "crosstool", + srcs = [":CROSSTOOL"], + output_licenses = ["unencumbered"], + ) +""" + +_DUMMY_CROSSTOOL_BUILD_FILE = """ +load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") + +error_gpu_disabled() +""" + +def _create_dummy_repository(repository_ctx): + cpu_value = get_cpu_value(repository_ctx) + + # Set up BUILD file for cuda/. 
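+    # The dummy repository substitutes placeholder versions and stub
+    # filegroups so that loading and analysis succeed on machines without
+    # CUDA.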
+ repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "False", + "%{cuda_extra_copts}": "[]", + "%{cuda_gpu_architectures}": "[]", + "%{cuda_version}": "0.0", + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.dummy_cuda_build_tpl, + { + "%{cuda_driver_lib}": _lib_name("cuda", cpu_value), + "%{cudart_static_lib}": _lib_name( + "cudart_static", + cpu_value, + static = True, + ), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), + "%{cudart_lib}": _lib_name("cudart", cpu_value), + "%{cublas_lib}": _lib_name("cublas", cpu_value), + "%{cublasLt_lib}": _lib_name("cublasLt", cpu_value), + "%{cusolver_lib}": _lib_name("cusolver", cpu_value), + "%{cudnn_lib}": _lib_name("cudnn", cpu_value), + "%{cufft_lib}": _lib_name("cufft", cpu_value), + "%{curand_lib}": _lib_name("curand", cpu_value), + "%{cupti_lib}": _lib_name("cupti", cpu_value), + "%{cusparse_lib}": _lib_name("cusparse", cpu_value), + "%{cub_actual}": ":cuda_headers", + "%{copy_rules}": """ +filegroup(name="cuda-include") +filegroup(name="cublas-include") +filegroup(name="cusolver-include") +filegroup(name="cufft-include") +filegroup(name="cusparse-include") +filegroup(name="curand-include") +filegroup(name="cudnn-include") +""", + }, + ) + + # Create dummy files for the CUDA toolkit since they are still required by + # tensorflow/tsl/platform/default/build_config:cuda. + repository_ctx.file("cuda/cuda/include/cuda.h") + repository_ctx.file("cuda/cuda/include/cublas.h") + repository_ctx.file("cuda/cuda/include/cudnn.h") + repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") + repository_ctx.file("cuda/cuda/nvml/include/nvml.h") + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value)) + repository_ctx.file( + "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value), + ) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublasLt", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusparse", cpu_value)) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. + repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": "", + "%{cudart_version}": "", + "%{cupti_version}": "", + "%{cublas_version}": "", + "%{cusolver_version}": "", + "%{curand_version}": "", + "%{cufft_version}": "", + "%{cusparse_version}": "", + "%{cudnn_version}": "", + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": "", + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({}), + ) + + # If cuda_configure is not configured to build with GPU support, and the user + # attempts to build with --config=cuda, add a dummy build rule to intercept + # this and fail with an actionable error message. 
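+    # error_gpu_disabled() in _DUMMY_CROSSTOOL_BZL_FILE calls fail() as soon
+    # as the generated crosstool package is loaded, surfacing the
+    # re-run-./configure message at loading time.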
+ repository_ctx.file( + "crosstool/error_gpu_disabled.bzl", + _DUMMY_CROSSTOOL_BZL_FILE, + ) + repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE) + +def _create_local_cuda_repository(repository_ctx): + """Creates the repository containing files set up to build with CUDA.""" + cuda_config = _get_cuda_config(repository_ctx) + + # Set up BUILD file for cuda/ + repository_ctx.template( + "cuda/build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_is_configured}": "True", + "%{cuda_extra_copts}": _compute_cuda_extra_copts( + cuda_config.compute_capabilities, + ), + "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities), + "%{cuda_version}": cuda_config.cuda_version, + }, + ) + + repository_ctx.template( + "cuda/BUILD", + repository_ctx.attr.cuda_build_tpl, + { + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cub_actual}": ":cuda_headers", + }, + ) + + is_nvcc_and_clang = _use_nvcc_and_clang(repository_ctx) + tf_sysroot = _tf_sysroot(repository_ctx) + + # Set up crosstool/ + cc = _find_cc(repository_ctx) + host_compiler_includes = get_cxx_inc_directories( + repository_ctx, + cc, + tf_sysroot, + ) + + cuda_defines = {} + + # We do not support hermetic CUDA on Windows. + # This ensures the CROSSTOOL file parser is happy. + cuda_defines.update({ + "%{msvc_env_tmp}": "msvc_not_used", + "%{msvc_env_path}": "msvc_not_used", + "%{msvc_env_include}": "msvc_not_used", + "%{msvc_env_lib}": "msvc_not_used", + "%{msvc_cl_path}": "msvc_not_used", + "%{msvc_ml_path}": "msvc_not_used", + "%{msvc_link_path}": "msvc_not_used", + "%{msvc_lib_path}": "msvc_not_used", + "%{win_compiler_deps}": ":empty", + }) + + cuda_defines["%{builtin_sysroot}"] = tf_sysroot + cuda_defines["%{cuda_toolkit_path}"] = repository_ctx.attr.nvcc_binary.workspace_root + cuda_defines["%{compiler}"] = "clang" + cuda_defines["%{host_compiler_prefix}"] = "/usr/bin" + cuda_defines["%{linker_bin_path}"] = "" + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" + cuda_defines["%{unfiltered_compile_flags}"] = "" + cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings(host_compiler_includes) + cuda_defines["%{cuda_nvcc_files}"] = "if_cuda([\"@{nvcc_archive}//:bin\", \"@{nvcc_archive}//:nvvm\"])".format(nvcc_archive = repository_ctx.attr.nvcc_binary.repo_name) + + if not is_nvcc_and_clang: + cuda_defines["%{host_compiler_path}"] = str(cc) + cuda_defines["%{host_compiler_warnings}"] = """ + # Some parts of the codebase set -Werror and hit this warning, so + # switch it off for now. 
+ "-Wno-invalid-partial-specialization" + """ + cuda_defines["%{compiler_deps}"] = ":cuda_nvcc_files" + repository_ctx.file( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + "", + ) + else: + cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" + cuda_defines["%{host_compiler_warnings}"] = "" + + nvcc_relative_path = "%s/%s" % (repository_ctx.attr.nvcc_binary.workspace_root, repository_ctx.attr.nvcc_binary.name) + cuda_defines["%{compiler_deps}"] = ":crosstool_wrapper_driver_is_not_gcc" + + wrapper_defines = { + "%{cpu_compiler}": str(cc), + "%{cuda_version}": cuda_config.cuda_version, + "%{nvcc_path}": nvcc_relative_path, + "%{host_compiler_path}": str(cc), + "%{use_clang_compiler}": "True", + } + repository_ctx.template( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + repository_ctx.attr.crosstool_wrapper_driver_is_not_gcc_tpl, + wrapper_defines, + ) + + _verify_build_defines(cuda_defines) + + # Only expand template variables in the BUILD file + repository_ctx.template( + "crosstool/BUILD", + repository_ctx.attr.crosstool_build_tpl, + cuda_defines, + ) + + # No templating of cc_toolchain_config - use attributes and templatize the + # BUILD file. + repository_ctx.template( + "crosstool/cc_toolchain_config.bzl", + repository_ctx.attr.cc_toolchain_config_tpl, + {}, + ) + + # Set up cuda_config.h, which is used by + # tensorflow/compiler/xla/stream_executor/dso_loader.cc. + repository_ctx.template( + "cuda/cuda/cuda_config.h", + repository_ctx.attr.cuda_config_tpl, + { + "%{cuda_version}": cuda_config.cuda_version, + "%{cudart_version}": cuda_config.cudart_version, + "%{cupti_version}": cuda_config.cupti_version, + "%{cublas_version}": cuda_config.cublas_version, + "%{cusolver_version}": cuda_config.cusolver_version, + "%{curand_version}": cuda_config.curand_version, + "%{cufft_version}": cuda_config.cufft_version, + "%{cusparse_version}": cuda_config.cusparse_version, + "%{cudnn_version}": cuda_config.cudnn_version, + "%{cuda_toolkit_path}": "", + "%{cuda_compute_capabilities}": ", ".join([ + cc.split("_")[1] + for cc in cuda_config.compute_capabilities + ]), + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + repository_ctx.attr.cuda_config_py_tpl, + _py_tmpl_dict({ + "cuda_version": cuda_config.cuda_version, + "cudnn_version": cuda_config.cudnn_version, + "cuda_compute_capabilities": cuda_config.compute_capabilities, + "cpu_compiler": str(cc), + }), + ) + +def _cuda_autoconf_impl(repository_ctx): + """Implementation of the cuda_autoconf repository rule.""" + build_file = repository_ctx.attr.local_config_cuda_build_file + + if not enable_cuda(repository_ctx): + _create_dummy_repository(repository_ctx) + else: + _create_local_cuda_repository(repository_ctx) + + repository_ctx.symlink(build_file, "BUILD") + +_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" +_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" +_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" +_TF_CUDA_VERSION = "TF_CUDA_VERSION" +TF_NEED_CUDA = "TF_NEED_CUDA" +_TF_NVCC_CLANG = "TF_NVCC_CLANG" +_TF_SYSROOT = "TF_SYSROOT" + +_ENVIRONS = [ + _CLANG_CUDA_COMPILER_PATH, + TF_NEED_CUDA, + _TF_NVCC_CLANG, + _TF_CUDA_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + _TF_SYSROOT, + _PYTHON_BIN_PATH, + "TMP", + "TMPDIR", +] + +hermetic_cuda_configure = repository_rule( + implementation = _cuda_autoconf_impl, + environ = _ENVIRONS, + attrs = { 
+ "environ": attr.string_dict(), + "cublas_version": attr.label(default = Label("@cuda_cublas//:version.txt")), + "cudart_version": attr.label(default = Label("@cuda_cudart//:version.txt")), + "cudnn_version": attr.label(default = Label("@cuda_cudnn//:version.txt")), + "cufft_version": attr.label(default = Label("@cuda_cufft//:version.txt")), + "cupti_version": attr.label(default = Label("@cuda_cupti//:version.txt")), + "curand_version": attr.label(default = Label("@cuda_curand//:version.txt")), + "cusolver_version": attr.label(default = Label("@cuda_cusolver//:version.txt")), + "cusparse_version": attr.label(default = Label("@cuda_cusparse//:version.txt")), + "nvcc_binary": attr.label(default = Label("@cuda_nvcc//:bin/nvcc")), + "local_config_cuda_build_file": attr.label(default = Label("//third_party/gpus:local_config_cuda.BUILD")), + "build_defs_tpl": attr.label(default = Label("//third_party/gpus/cuda:build_defs.bzl.tpl")), + "cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.hermetic.tpl")), + "dummy_cuda_build_tpl": attr.label(default = Label("//third_party/gpus/cuda:BUILD.tpl")), + "cuda_config_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.h.tpl")), + "cuda_config_py_tpl": attr.label(default = Label("//third_party/gpus/cuda:cuda_config.py.tpl")), + "crosstool_wrapper_driver_is_not_gcc_tpl": attr.label(default = Label("//third_party/gpus/crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl")), + "crosstool_build_tpl": attr.label(default = Label("//third_party/gpus/crosstool:BUILD.tpl")), + "cc_toolchain_config_tpl": attr.label(default = Label("//third_party/gpus/crosstool:cc_toolchain_config.bzl.tpl")), + }, +) +"""Detects and configures the hermetic CUDA toolchain. + +Add the following to your WORKSPACE FILE: + +```python +hermetic cuda_configure(name = "local_config_cuda") +``` + +Args: + name: A unique name for this workspace rule. 
+""" diff --git a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl index 0fd4019fc5bb75..cf756b452e3950 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl @@ -22,12 +22,15 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "enable_cuda", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) load( ":sycl_configure.bzl", diff --git a/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl index 05330b2fe53195..dd80694e7274f5 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/sycl_configure.bzl @@ -16,11 +16,14 @@ load( "realpath", "which", ) +load( + ":compiler_common_tools.bzl", + "to_list_of_strings", +) load( ":cuda_configure.bzl", "make_copy_dir_rule", "make_copy_files_rule", - "to_list_of_strings", ) _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" diff --git a/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl b/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl index 53a6d4e1e41890..a0930df34ecec8 100644 --- a/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/xla/third_party/tsl/third_party/nccl/build_defs.bzl.tpl @@ -5,7 +5,6 @@ load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") # CUDA toolkit version as tuple (e.g. '(11, 1)'). _cuda_version = %{cuda_version} -_cuda_clang = %{cuda_clang} def _rdc_copts(): """Returns copts for compiling relocatable device code.""" @@ -121,25 +120,25 @@ _device_link = rule( "gpu_archs": attr.string_list(), "nvlink_args": attr.string_list(), "_nvlink": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"), + default = Label("%{nvlink_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_fatbinary": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"), + default = Label("%{fatbinary_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_bin2c": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"), + default = Label("%{bin2c_label}"), allow_single_file = True, executable = True, cfg = "host", ), "_link_stub": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"), + default = Label("%{link_stub_label}"), allow_single_file = True, ), }, @@ -189,7 +188,7 @@ _prune_relocatable_code = rule( "input": attr.label(mandatory = True, allow_files = True), "gpu_archs": attr.string_list(), "_nvprune": attr.label( - default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"), + default = Label("%{nvprune_label}"), allow_single_file = True, executable = True, cfg = "host", diff --git a/third_party/xla/third_party/tsl/third_party/nccl/hermetic_nccl_configure.bzl b/third_party/xla/third_party/tsl/third_party/nccl/hermetic_nccl_configure.bzl new file mode 100644 index 00000000000000..b99cbcb08db58a --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/nccl/hermetic_nccl_configure.bzl @@ -0,0 +1,153 @@ +"""Repository rule for hermetic NCCL configuration. 
+ +`hermetic_nccl_configure` depends on the following environment variables: + + * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should + be used, "0" if NCCL should be linked in statically. + +""" + +load( + "//third_party/gpus:hermetic_cuda_configure.bzl", + "TF_NEED_CUDA", + "enable_cuda", + "get_cuda_version", +) +load( + "//third_party/remote_config:common.bzl", + "get_cpu_value", + "get_host_environ", +) + +_TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" + +_NCCL_DUMMY_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nccl_config", + hdrs = ["nccl_config.h"], + include_prefix = "third_party/nccl", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_BUILD_CONTENT = """ +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +_NCCL_ARCHIVE_STUB_BUILD_CONTENT = """ +alias( + name = "nccl_lib", + actual = "@cuda_nccl//:nccl_lib", +) + +filegroup( + name = "LICENSE", + data = ["@nccl_archive//:LICENSE.txt"], + visibility = ["//visibility:public"], +) + +alias( + name = "nccl", + actual = "@nccl_archive//:nccl_via_stub", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_headers", + actual = "@nccl_archive//:nccl_headers", + visibility = ["//visibility:public"], +) + +alias( + name = "nccl_config", + actual = "@nccl_archive//:nccl_config", + visibility = ["//visibility:public"], +) +""" + +def _create_local_nccl_repository(repository_ctx): + cuda_version = get_cuda_version(repository_ctx) + if cuda_version == "12": + cuda_version = "12.3" + cuda_version = cuda_version.split(".") + + # Alias to open source build from @nccl_archive. + if get_host_environ(repository_ctx, _TF_NCCL_USE_STUB, "0") == "0": + repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT) + else: + repository_ctx.file("BUILD", _NCCL_ARCHIVE_STUB_BUILD_CONTENT) + + repository_ctx.template("generated_names.bzl", repository_ctx.attr.generated_names_tpl, {}) + repository_ctx.template( + "build_defs.bzl", + repository_ctx.attr.build_defs_tpl, + { + "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), + "%{nvlink_label}": "@cuda_nvcc//:nvlink", + "%{fatbinary_label}": "@cuda_nvcc//:fatbinary", + "%{bin2c_label}": "@cuda_nvcc//:bin2c", + "%{link_stub_label}": "@cuda_nvcc//:link_stub", + "%{nvprune_label}": "@cuda_nvprune//:nvprune", + }, + ) + +def _nccl_autoconf_impl(repository_ctx): + if (not enable_cuda(repository_ctx) or + get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")): + # Add a dummy build file to make bazel query happy. 
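+        # The dummy file defines an empty :nccl library and an empty
+        # TF_NCCL_VERSION so that dependent labels still resolve without CUDA.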
+ repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT) + repository_ctx.file("nccl_config.h", "#define TF_NCCL_VERSION \"\"") + else: + _create_local_nccl_repository(repository_ctx) + +_ENVIRONS = [ + TF_NEED_CUDA, +] + +hermetic_nccl_configure = repository_rule( + environ = _ENVIRONS, + implementation = _nccl_autoconf_impl, + attrs = { + "environ": attr.string_dict(), + "generated_names_tpl": attr.label(default = Label("//third_party/nccl:generated_names.bzl.tpl")), + "build_defs_tpl": attr.label(default = Label("//third_party/nccl:build_defs.bzl.tpl")), + "system_build_tpl": attr.label(default = Label("//third_party/nccl:system.BUILD.tpl")), + }, +) +"""Downloads and configures the hermetic NCCL configuration. + +Add the following to your WORKSPACE FILE: + +```python +hermetic_nccl_configure(name = "local_config_nccl") +``` + +Args: + name: A unique name for this workspace rule. +""" diff --git a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl index 22cf64d4771062..4da2513e03eb44 100644 --- a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl @@ -8,7 +8,6 @@ files. * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is `/usr/local/cuda,usr/`. - * `TF_CUDA_CLANG`: "1" if using Clang, "0" if using NVCC. * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should be used, "0" if NCCL should be linked in statically. @@ -33,7 +32,6 @@ _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" _TF_NCCL_VERSION = "TF_NCCL_VERSION" _TF_NEED_CUDA = "TF_NEED_CUDA" _TF_CUDA_PATHS = "TF_CUDA_PATHS" -_TF_CUDA_CLANG = "TF_CUDA_CLANG" _TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB" _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR" @@ -129,7 +127,11 @@ def _create_local_nccl_repository(repository_ctx): _label("build_defs.bzl.tpl"), { "%{cuda_version}": "(%s, %s)" % tuple(cuda_version), - "%{cuda_clang}": repr(get_host_environ(repository_ctx, _TF_CUDA_CLANG)), + "%{nvlink_label}": "@local_config_cuda//cuda:cuda/bin/nvlink", + "%{fatbinary_label}": "@local_config_cuda//cuda:cuda/bin/fatbinary", + "%{bin2c_label}": "@local_config_cuda//cuda:cuda/bin/bin2c", + "%{link_stub_label}": "@local_config_cuda//cuda:cuda/bin/crt/link.stub", + "%{nvprune_label}": "@local_config_cuda//cuda:cuda/bin/nvprune", }, ) else: @@ -181,7 +183,6 @@ _ENVIRONS = [ _TF_CUDA_COMPUTE_CAPABILITIES, _TF_NEED_CUDA, _TF_CUDA_PATHS, - _TF_CUDA_CLANG, ] remote_nccl_configure = repository_rule( diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl index f2eecd61a5faf7..c105ee4544f51f 100644 --- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl +++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl @@ -703,7 +703,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -742,7 +742,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -782,7 +782,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - 
"TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) @@ -820,7 +820,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl index 18a84d96c39f82..9ade984f45351d 100644 --- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl +++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl @@ -1,8 +1,13 @@ """Macro that creates external repositories for remote config.""" -load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") -load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure") @@ -42,7 +47,7 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "TF_CUDNN_VERSION": cudnn_version, "TF_CUDA_VERSION": cuda_version, "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "", "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu", "GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "", @@ -58,13 +63,17 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "Pool": "default", } - remote_cuda_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_cuda_configure. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_nccl_configure. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, @@ -175,13 +184,15 @@ def sigbuild_tf_configs(name_container_map, env): "Pool": "default", } - remote_cuda_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_cuda_configure for non-hermetic CUDA. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_nccl_configure for non-hermetic NCCL. 
+ hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD index ca974552eca1ab..9db8ada08122f6 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD +++ b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD @@ -3,6 +3,7 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library") load( "@local_xla//xla/tsl:tsl.bzl", + "if_hermetic_cuda_tools", "if_not_fuchsia", "if_not_windows", "internal_visibility", @@ -58,6 +59,9 @@ cc_library( srcs = ["cuda_libdevice_path.cc"], hdrs = ["//tsl/platform:cuda_libdevice_path.h"], compatible_with = [], + data = if_hermetic_cuda_tools([ + "@cuda_nvcc//:nvvm", + ]), tags = [ "manual", "no_oss", @@ -65,6 +69,7 @@ cc_library( ], deps = [ "//tsl/platform", + "//tsl/platform:env", "//tsl/platform:logging", "//tsl/platform:path", "//tsl/platform:types", diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc b/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc index 46321e74b5dc38..ee95a3d17fd68e 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc @@ -31,6 +31,7 @@ limitations under the License. #if !defined(PLATFORM_GOOGLE) #include "third_party/gpus/cuda/cuda_config.h" +#include "tsl/platform/env.h" #endif #include "tsl/platform/logging.h" @@ -40,6 +41,17 @@ std::vector CandidateCudaRoots() { #if !defined(PLATFORM_GOOGLE) auto roots = std::vector{TF_CUDA_TOOLKIT_PATH, std::string("/usr/local/cuda")}; + std::string runfiles_suffix = "runfiles"; + std::string executable_path = tsl::Env::Default()->GetExecutablePath(); + std::string cuda_nvcc_dir = + io::JoinPath(executable_path + "." + runfiles_suffix, "cuda_nvcc"); + roots.emplace_back(cuda_nvcc_dir); + std::string runfiles_dir = tsl::Env::Default()->GetRunfilesDir(); + std::size_t runfiles_ind = runfiles_dir.rfind(runfiles_suffix); + cuda_nvcc_dir = io::JoinPath( + runfiles_dir.substr(0, runfiles_ind + runfiles_suffix.length()), + "cuda_nvcc"); + roots.emplace_back(cuda_nvcc_dir); #if defined(PLATFORM_POSIX) && !defined(__APPLE__) Dl_info info; @@ -53,6 +65,9 @@ std::vector CandidateCudaRoots() { // relative to the current binary for the wheel-based nvcc package. for (auto path : {"../nvidia/cuda_nvcc", "../../nvidia/cuda_nvcc"}) roots.emplace_back(io::JoinPath(dir, path)); + + // Also add the path to the copy of libdevice.10.bc we include with XLA. + roots.emplace_back(io::JoinPath(dir, "cuda")); } #endif // defined(PLATFORM_POSIX) && !defined(__APPLE__) diff --git a/third_party/xla/third_party/tsl/workspace2.bzl b/third_party/xla/third_party/tsl/workspace2.bzl index 001ce018d87066..7274961489c516 100644 --- a/third_party/xla/third_party/tsl/workspace2.bzl +++ b/third_party/xla/third_party/tsl/workspace2.bzl @@ -6,6 +6,7 @@ load("@bazel_skylib//lib:versions.bzl", "versions") # Import external repository rules. load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external") load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external") +load("//third_party:cuda_repo.bzl", "cuda_distributives") load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") # Import third party repository rules. See go/tfbr-thirdparty. 
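[Editor's note] The workspace hunks above and below swap the non-hermetic `cuda_configure`/`nccl_configure` repository rules for their hermetic counterparts and register the new `cuda_redist_json` and `cuda_distributives` repositories. As a rough sketch of how the pieces fit together in a consuming workspace — collapsed into one file for brevity, whereas this patch splits the calls across workspace2.bzl and workspace3.bzl — the wiring would look roughly like the following; the pins reuse the 12.3.2 values that appear later in this diff:

```python
# Sketch only, not part of this patch: illustrative WORKSPACE wiring for the
# hermetic repository rules. Load paths match the ones introduced in this diff.
load("//third_party:cuda_redist_json_repo.bzl", "cuda_redist_json")
load("//third_party:cuda_repo.bzl", "cuda_distributives")
load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure")
load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure")

# Pin the NVIDIA redistrib JSON files describing each CUDA/cuDNN release.
cuda_redist_json(
    name = "cuda_redist_json",
    cuda_json_dict = {
        "12.3.2": [
            "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.3.2.json",
            "1b6eacf335dd49803633fed53ef261d62c193e5a56eee5019e7d2f634e39e7ef",
        ],
    },
    cudnn_json_dict = {
        "8.9.7.29": [
            "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.9.7.29.json",
            "a0734f26f068522464fa09b2f2c186dfbe6ad7407a88ea0c50dd331f0c3389ec",
        ],
    },
)

# Download the pinned NCCL wheel into the Bazel cache (x86_64 entry shown).
cuda_distributives(cuda_nccl_wheel_dict = {
    "12.3.2": {
        "x86_64-unknown-linux-gnu": {
            "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl",
            "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d",
        },
    },
})

# Generate @local_config_cuda and @local_config_nccl from the hermetic repos.
hermetic_cuda_configure(name = "local_config_cuda")
hermetic_nccl_configure(name = "local_config_nccl")
```

With this wiring, Bazel fetches the pinned CUDA, cuDNN, and NCCL archives itself, rather than locating a host installation through `TF_CUDA_PATHS`.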
@@ -17,14 +18,20 @@ load("//third_party/eigen3:workspace.bzl", eigen3 = "repo") load("//third_party/farmhash:workspace.bzl", farmhash = "repo") load("//third_party/gemmlowp:workspace.bzl", gemmlowp = "repo") load("//third_party/git:git_configure.bzl", "git_configure") -load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "rocm_configure") load("//third_party/gpus:sycl_configure.bzl", "sycl_configure") load("//third_party/hwloc:workspace.bzl", hwloc = "repo") load("//third_party/implib_so:workspace.bzl", implib_so = "repo") load("//third_party/llvm:setup.bzl", "llvm_setup") load("//third_party/nasm:workspace.bzl", nasm = "repo") -load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "python_configure") load("//third_party/py/ml_dtypes:workspace.bzl", ml_dtypes = "repo") load("//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo") @@ -69,9 +76,15 @@ def _tf_toolchains(): # Note that we check the minimum bazel version in WORKSPACE. clang6_configure(name = "local_config_clang6") cc_download_clang_toolchain(name = "local_config_download_clang") - cuda_configure(name = "local_config_cuda") + + # If you need to use non-hermetic CUDA, replace the line below with + # cuda_configure(name = "local_config_cuda") + hermetic_cuda_configure(name = "local_config_cuda") tensorrt_configure(name = "local_config_tensorrt") - nccl_configure(name = "local_config_nccl") + + # If you need to use non-hermetic CUDA, replace the line below with + # nccl_configure(name = "local_config_nccl") + hermetic_nccl_configure(name = "local_config_nccl") git_configure(name = "local_config_git") syslibs_configure(name = "local_config_syslibs") python_configure(name = "local_config_python") @@ -597,6 +610,28 @@ def _tf_repositories(): urls = tf_mirror_urls("https://github.com/google/glog/archive/refs/tags/v0.4.0.tar.gz"), ) +_CUDA_12_3_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + "sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + +_CUDA_12_1_NCCL_WHEEL_DICT = { + "x86_64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", + "sha256": "a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d", + }, + "aarch64-unknown-linux-gnu": { + "url": "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", + 
"sha256": "1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", + }, +} + def workspace(): # Check the bazel version before executing any repository rules, in case # those rules rely on the version we require here. @@ -614,6 +649,10 @@ def workspace(): # don't already exist (at least if the external repository macros were # written according to common practice to query native.existing_rule()). _tf_repositories() + cuda_distributives(cuda_nccl_wheel_dict = { + "12.3.2": _CUDA_12_3_NCCL_WHEEL_DICT, + "12.1.1": _CUDA_12_1_NCCL_WHEEL_DICT, + }) # Alias so it can be loaded without assigning to a different symbol to prevent # shadowing previous loads and trigger a buildifier warning. diff --git a/third_party/xla/third_party/tsl/workspace3.bzl b/third_party/xla/third_party/tsl/workspace3.bzl index a1293f59a48885..adba216bc518a1 100644 --- a/third_party/xla/third_party/tsl/workspace3.bzl +++ b/third_party/xla/third_party/tsl/workspace3.bzl @@ -1,8 +1,31 @@ """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it.""" load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:cuda_redist_json_repo.bzl", "cuda_redist_json") load("//third_party/llvm:workspace.bzl", llvm = "repo") +_CUDA_REDIST_JSON_DICT = { + "12.1.1": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.1.1.json", + "bafea3cb83a4cf5c764eeedcaac0040d0d3c5db3f9a74550da0e7b6ac24d378c", + ], + "12.3.2": [ + "https://developer.download.nvidia.com/compute/cuda/redist/redistrib_12.3.2.json", + "1b6eacf335dd49803633fed53ef261d62c193e5a56eee5019e7d2f634e39e7ef", + ], +} + +_CUDNN_REDIST_JSON_DICT = { + "8.6": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.6.0.json", + "7f6f50bed4fd8216dc10d6ef505771dc0ecc99cce813993ab405cb507a21d51d", + ], + "8.9.7.29": [ + "https://developer.download.nvidia.com/compute/cudnn/redist/redistrib_8.9.7.29.json", + "a0734f26f068522464fa09b2f2c186dfbe6ad7407a88ea0c50dd331f0c3389ec", + ], +} + def workspace(): http_archive( name = "io_bazel_rules_closure", @@ -46,6 +69,13 @@ def workspace(): # but provides a script for setting up build rules via overlays. llvm("llvm-raw") + # Load JSON files for CUDA and cuDNN distribution versions. + cuda_redist_json( + name = "cuda_redist_json", + cuda_json_dict = _CUDA_REDIST_JSON_DICT, + cudnn_json_dict = _CUDNN_REDIST_JSON_DICT, + ) + # Alias so it can be loaded without assigning to a different symbol to prevent # shadowing previous loads and trigger a buildifier warning. 
tsl_workspace3 = workspace diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl index f2eecd61a5faf7..c105ee4544f51f 100644 --- a/third_party/xla/tools/toolchains/remote_config/configs.bzl +++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl @@ -703,7 +703,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -742,7 +742,7 @@ def initialize_rbe_configs(): "TF_CUDNN_VERSION": "8.9", "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_SYSROOT": "/dt9", "TF_TENSORRT_VERSION": "8.6", }, @@ -782,7 +782,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) @@ -820,7 +820,7 @@ def initialize_rbe_configs(): "TF_ENABLE_XLA": "1", "TF_NEED_CUDA": "1", "TF_SYSROOT": "/dt9", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": "8.6", }, ) diff --git a/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl b/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl index 18a84d96c39f82..9ade984f45351d 100644 --- a/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl +++ b/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl @@ -1,8 +1,13 @@ """Macro that creates external repositories for remote config.""" -load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure") load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") -load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + +# If you need to use non-hermetic CUDA, replace the line below with +# load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") +load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure") load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure") @@ -42,7 +47,7 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "TF_CUDNN_VERSION": cudnn_version, "TF_CUDA_VERSION": cuda_version, "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu", - "TF_NEED_TENSORRT": "1", + "TF_NEED_TENSORRT": "0", "TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "", "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu", "GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "", @@ -58,13 +63,17 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N "Pool": "default", } - remote_cuda_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_cuda_configure. 
+ hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # If you need to use non-hermetic CUDA, replace the call below with + # remote_nccl_configure. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, @@ -175,13 +184,15 @@ def sigbuild_tf_configs(name_container_map, env): "Pool": "default", } - remote_cuda_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_cuda_configure for non-hermetic CUDA. + hermetic_cuda_configure( name = "%s_config_cuda" % name, environ = env, exec_properties = exec_properties, ) - remote_nccl_configure( + # TODO (ybaturina): DEPRECATED: replace with remote_nccl_configure for non-hermetic NCCL. + hermetic_nccl_configure( name = "%s_config_nccl" % name, environ = env, exec_properties = exec_properties, diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD index 5c8aaea5723b70..ac1d1a87becb06 100644 --- a/third_party/xla/xla/service/BUILD +++ b/third_party/xla/xla/service/BUILD @@ -1382,6 +1382,7 @@ cc_library( ]) + if_cuda_is_configured([ "//xla/service/gpu:nvptx_compiler", "//xla/stream_executor/cuda:stream_executor_cuda", + "//xla/tsl:gpu_runtime_hermetic_cuda_deps", ]) + if_rocm_is_configured([ "//xla/service/gpu:amdgpu_compiler", "//xla/stream_executor/rocm:stream_executor_rocm", diff --git a/third_party/xla/xla/service/gpu/tests/add_preds.hlo b/third_party/xla/xla/service/gpu/tests/add_preds.hlo index 120b6a5ad686bf..b106b806c0470c 100644 --- a/third_party/xla/xla/service/gpu/tests/add_preds.hlo +++ b/third_party/xla/xla/service/gpu/tests/add_preds.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s // CHECK: define void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]], diff --git a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo index c84e0194c347cb..ba00bd6423aa74 100644 --- a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo +++ b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // Arguments are passed separately. 
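[Editor's note] The `// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=..."` line added to each HLO test here (and to the remaining tests below) points XLA's CUDA data directory at the hermetic `@cuda_nvcc` repository staged in the test's runfiles, instead of a system CUDA installation; `%S` is the lit substitution for the directory containing the test source. A small sketch of how the relative path resolves, using a hypothetical runfiles layout:

```python
# Sketch with an assumed runfiles layout (the directory name "/runfiles" is
# hypothetical): how %S/../../../../../cuda_nvcc resolves for a test under
# xla/service/gpu/tests.
import os

test_source_dir = "/runfiles/xla/xla/service/gpu/tests"  # what lit puts in %S
data_dir = os.path.normpath(
    os.path.join(test_source_dir, "../../../../../cuda_nvcc"))
# Five ".." components climb out of tests/gpu/service/xla and the workspace
# directory, landing at the runfiles root where external repos live.
assert data_dir == "/runfiles/cuda_nvcc"
print(data_dir)
```

The `gpu_hlo*.hlo` tests later in this diff use four `..` components instead of five because `xla/tools/hlo_opt` sits one directory shallower than `xla/service/gpu/tests`.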
diff --git a/third_party/xla/xla/service/gpu/tests/copy.hlo b/third_party/xla/xla/service/gpu/tests/copy.hlo index beac8e6d36b115..997cefda91b22a 100644 --- a/third_party/xla/xla/service/gpu/tests/copy.hlo +++ b/third_party/xla/xla/service/gpu/tests/copy.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo b/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo index 3d0af18b081103..974fb26e5a9193 100644 --- a/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo +++ b/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo b/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo index b49e155da0a685..05d2b141757621 100644 --- a/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo +++ b/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK-LLVM %s // We check that the row loads are vectorized. 
diff --git a/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo b/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo index 9a30436ebfa38c..e11711b8ba9556 100644 --- a/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo +++ b/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/fused_slice.hlo b/third_party/xla/xla/service/gpu/tests/fused_slice.hlo index 4affcb0de7533b..b5abb7dafa5960 100644 --- a/third_party/xla/xla/service/gpu/tests/fused_slice.hlo +++ b/third_party/xla/xla/service/gpu/tests/fused_slice.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo index 41734e06259a00..431edac2748f42 100644 --- a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo +++ b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // All fusions must reuse the same kernel: diff --git a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo index bcfa37733f7e67..ecdb8e91df4342 100644 --- a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo +++ b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // This tests that we do not increase the grid launch size when // few_waves is enabled. 
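[Editor's note] These `--xla_gpu_cuda_data_dir` overrides only work because the hermetic files are actually staged into runfiles: earlier in this diff, `cuda_libdevice_path` gains `data = if_hermetic_cuda_tools(["@cuda_nvcc//:nvvm"])`, and the `ptxas_wrapper`/`nvlink_wrapper`/`fatbinary_wrapper` targets below do the same for the compilation tools. As a hedged illustration — the target name and source file here are invented, not part of this patch — a test that needs the same staging could do:

```python
# Hypothetical target, for illustration only; if_hermetic_cuda_tools is the
# select() helper defined in xla/tsl/tsl.bzl at the end of this diff.
load("//xla/tsl:tsl.bzl", "if_hermetic_cuda_tools")

cc_test(
    name = "my_gpu_smoke_test",  # assumed name
    srcs = ["my_gpu_smoke_test.cc"],  # assumed source
    data = if_hermetic_cuda_tools([
        # libdevice bitcode, discovered at runtime via CandidateCudaRoots():
        "@cuda_nvcc//:nvvm",
        # PTX assembler, mirroring the ptxas_wrapper target below:
        "@cuda_nvcc//:ptxas",
    ]),
)
```

Because `if_hermetic_cuda_tools` selects on `@local_config_cuda//cuda:hermetic_cuda_tools`, the extra data dependencies disappear entirely in non-hermetic builds.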
diff --git a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo index 6e147df3928c09..5f2d6d64eb829d 100644 --- a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo +++ b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo b/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo index c7165c9e11763c..05537b327fdc57 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_atomic_min.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} // Check that for "min" we are still using atomics (CAS loop). diff --git a/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo b/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo index cb30643886de4e..1b30f13e0e1af9 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_column_layout_change.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule reduce_with_layout_change, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo b/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo index 982e45863e2547..70138e77d9362b 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_f64_column.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule m, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo b/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo index 736e583ad4c3c9..5fa7f4ed5b5c2c 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_large_row_to_scalar.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule LargeReduction, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo b/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo index 
bba7986d830fb3..7107fd24d491f2 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_row_vectorized.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule RowReductionVectorized, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo b/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo index 844c3ded2ef024..919a4dc1e9c7a9 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s // CHECK: define void @fusion_row_reduction_too_small( diff --git a/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo b/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo index 36008daa5ceda8..64029a503506b8 100644 --- a/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduce_variadic_column.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s --check-prefixes=CHECK,CHECK-%{PTX} HloModule Test, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo b/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo index 6a25580a4bcff9..a1d7eb6aa38619 100644 --- a/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo +++ b/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/p100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM60 // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 diff --git a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo index e140b56af9d60c..7836f3ee1ddbe9 100644 --- a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo +++ b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule TestModule, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/scatter.hlo 
b/third_party/xla/xla/service/gpu/tests/scatter.hlo index 20211bdbe892f4..b81113587814da 100644 --- a/third_party/xla/xla/service/gpu/tests/scatter.hlo +++ b/third_party/xla/xla/service/gpu/tests/scatter.hlo @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // CHECK-LABEL: entry: diff --git a/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo b/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo index 08751943c13efb..587a605dca24ee 100644 --- a/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo +++ b/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo index c8378f746aa983..3fdbc565981679 100644 --- a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo +++ b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_80.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM80 // RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/h100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM90 diff --git a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo index 242bd749bdaf11..a62181874c323c 100644 --- a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo +++ b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py diff --git a/third_party/xla/xla/service/gpu/tests/transpose_021.hlo b/third_party/xla/xla/service/gpu/tests/transpose_021.hlo index 7d3e1fe0ffb9f6..ef8580fa62ac77 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_021.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_021.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations 
--xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo b/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo index 5e638321294f1e..659a9e2abf3dec 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_021_extra_output.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/transpose_210.hlo b/third_party/xla/xla/service/gpu/tests/transpose_210.hlo index f37bd17ffe2a6e..1fb1dd139797da 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_210.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_210.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo b/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo index a3831d2da1de52..b9693dfc388679 100644 --- a/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo +++ b/third_party/xla/xla/service/gpu/tests/transpose_210_extra_output.hlo @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule Transpose, is_scheduled=true diff --git a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo index 2739e349181786..36704d7fcb280f 100644 --- a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo +++ b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s // CHECK-PTX: define void @triton_gemm_r( diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD index 234649a31ea65b..30d0d3d48c51dd 100644 --- a/third_party/xla/xla/stream_executor/cuda/BUILD +++ b/third_party/xla/xla/stream_executor/cuda/BUILD @@ -24,7 +24,7 @@ load( "tf_additional_cudnn_plugin_copts", "tf_additional_gpu_compilation_copts", ) -load("//xla/tsl:tsl.bzl", "if_google", "if_nccl", "internal_visibility", "tsl_copts") 
+load("//xla/tsl:tsl.bzl", "if_google", "if_hermetic_cuda_tools", "if_nccl", "internal_visibility", "tsl_copts") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -113,13 +113,31 @@ cuda_only_cc_library( # Buildozer can not remove dependencies inside select guards, so we have to use # an intermediate target. -cc_library(name = "ptxas_wrapper") +cc_library( + name = "ptxas_wrapper", + data = if_hermetic_cuda_tools( + ["@cuda_nvcc//:ptxas"], + [], + ), +) -cc_library(name = "nvlink_wrapper") +cc_library( + name = "nvlink_wrapper", + data = if_hermetic_cuda_tools( + ["@cuda_nvcc//:nvlink"], + [], + ), +) # Buildozer can not remove dependencies inside select guards, so we have to use # an intermediate target. -cc_library(name = "fatbinary_wrapper") +cc_library( + name = "fatbinary_wrapper", + data = if_hermetic_cuda_tools( + ["@cuda_nvcc//:fatbinary"], + [], + ), +) cuda_only_cc_library( name = "cuda_driver", diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo index 0633b7e5ef7ce8..8ab7b9039faf0f 100755 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule module diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo index 61b6b9aa778b9e..66317c2d276c46 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_backend.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=hlo-backend --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule module diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo index e7b8321cc6480b..a706e7c75df0d1 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_buffers.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=buffer-assignment --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s HloModule m diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo index 59800a9d170560..2eb00d4cac81b8 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_llvm.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s HloModule m diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo index 5c6485a57813a8..fae7ed1437107d 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_ptx.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=CUDA --stage=ptx --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s 
HloModule m diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo index 6c8bc8bd54fe6a..63766db3e0b039 100644 --- a/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo +++ b/third_party/xla/xla/tools/hlo_opt/gpu_hlo_unoptimized_llvm.hlo @@ -1,3 +1,4 @@ +// RUN: export XLA_FLAGS="--xla_gpu_cuda_data_dir=%S/../../../../cuda_nvcc" // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/gpu_specs/%{GPU}.txtpb | FileCheck %s // CHECK: fusion.in_bounds-true: diff --git a/third_party/xla/xla/tsl/BUILD b/third_party/xla/xla/tsl/BUILD index 8a7cb42086d5d5..47d112444da58e 100644 --- a/third_party/xla/xla/tsl/BUILD +++ b/third_party/xla/xla/tsl/BUILD @@ -1,7 +1,7 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@bazel_skylib//lib:selects.bzl", "selects") load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "bool_setting") -load("tsl.bzl", "if_google", "if_oss") +load("tsl.bzl", "if_google", "if_hermetic_cuda_libs", "if_oss") # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) @@ -551,3 +551,25 @@ cc_library( }), ) # copybara:comment_end + +cc_library( + name = "gpu_runtime_hermetic_cuda_deps", + tags = ["manual"], + visibility = ["//visibility:public"], + deps = if_hermetic_cuda_libs([ + "@cuda_cudart//:cudart", + "@cuda_cudnn//:cudnn", + "@cuda_cudnn//:cudnn_ops_infer", + "@cuda_cudnn//:cudnn_cnn_infer", + "@cuda_cudnn//:cudnn_ops_train", + "@cuda_cudnn//:cudnn_cnn_train", + "@cuda_cudnn//:cudnn_adv_infer", + "@cuda_cudnn//:cudnn_adv_train", + "@cuda_cublas//:cublas", + "@cuda_cublas//:cublasLt", + "@cuda_cusolver//:cusolver", + "@cuda_cufft//:cufft", + "@cuda_cusparse//:cusparse", + "@cuda_nvjitlink//:nvjitlink", + ]), +) diff --git a/third_party/xla/xla/tsl/cuda/BUILD.bazel b/third_party/xla/xla/tsl/cuda/BUILD.bazel index 6f0e9aefab72f0..0992a9bcf1c775 100644 --- a/third_party/xla/xla/tsl/cuda/BUILD.bazel +++ b/third_party/xla/xla/tsl/cuda/BUILD.bazel @@ -10,6 +10,10 @@ load( "cuda_rpath_flags", "if_cuda_is_configured", ) +load( + "//xla/tsl:tsl.bzl", + "if_hermetic_cuda_libs", +) load("//xla/tsl/cuda:stub.bzl", "cuda_stub") package( @@ -41,6 +45,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cublas//:cublas", ]), ) @@ -65,6 +71,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cublas//:cublasLt", ]), ) @@ -126,7 +134,9 @@ cc_library( "@local_tsl//tsl/platform:logging", ], "//conditions:default": [], - }), + }) + if_hermetic_cuda_libs([ + "@cuda_cudart//:cudart", + ]), ) cuda_stub( @@ -152,6 +162,14 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cudnn//:cudnn", + "@cuda_cudnn//:cudnn_ops_infer", + "@cuda_cudnn//:cudnn_cnn_infer", + "@cuda_cudnn//:cudnn_ops_train", + "@cuda_cudnn//:cudnn_cnn_train", + "@cuda_cudnn//:cudnn_adv_infer", + "@cuda_cudnn//:cudnn_adv_train", ]), ) @@ -189,6 +207,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cufft//:cufft", ]), ) @@ -216,6 +236,8 @@ cc_library( 
"@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cupti//:cupti", ]), ) @@ -241,6 +263,8 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cusolver//:cusolver", ]), ) @@ -266,6 +290,9 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_cusparse//:cusparse", + "@cuda_nvjitlink//:nvjitlink", ]), ) @@ -293,5 +320,7 @@ cc_library( "@local_tsl//tsl/platform:dso_loader", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:load_library", + ]) + if_hermetic_cuda_libs([ + "@cuda_nccl//:nccl", ]), ) diff --git a/third_party/xla/xla/tsl/tsl.bzl b/third_party/xla/xla/tsl/tsl.bzl index 5ff893e03edda8..505a4deb8a4518 100644 --- a/third_party/xla/xla/tsl/tsl.bzl +++ b/third_party/xla/xla/tsl/tsl.bzl @@ -224,6 +224,17 @@ def if_with_tpu_support(if_true, if_false = []): "//conditions:default": if_false, }) +# These configs are used to determine whether we should use the hermetic CUDA +# tools in cc_libraries (see go/hermetic-cuda). +# They are intended for the OSS builds only. +def if_hermetic_cuda_tools(if_true, if_false = []): + """Shorthand for select()'ing on whether we're building with hermetic CUDA tools.""" + return select({"@local_config_cuda//cuda:hermetic_cuda_tools": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false + +def if_hermetic_cuda_libs(if_true, if_false = []): + """Shorthand for select()'ing on whether we need to include hermetic CUDA libraries.""" + return select({"@local_config_cuda//cuda:hermetic_cuda_tools_and_libs": if_true, "//conditions:default": if_false}) # copybara:comment_replace return if_false + def get_win_copts(is_external = False): WINDOWS_COPTS = [ # copybara:uncomment_begin(no MSVC flags in google)