[xla:cpu] NFC: Remove deprecated XLA:CPU mlir based codegen part #4
PiperOrigin-RevId: 630139768
ezhulenev authored and tensorflower-gardener committed May 2, 2024
1 parent b470d92 commit 69b7c1c
Showing 6 changed files with 16 additions and 308 deletions.
13 changes: 0 additions & 13 deletions third_party/xla/xla/service/cpu/BUILD
@@ -245,19 +245,10 @@ cc_library(
"//xla:xla_proto_cc",
"//xla/hlo/ir:hlo",
"//xla/hlo/ir:hlo_module_group",
"//xla/mlir/framework/ir:xla_framework",
"//xla/mlir/runtime/ir:rt",
"//xla/mlir/runtime/transforms:calling_convention",
"//xla/mlir/runtime/transforms:compilation_pipeline_cpu",
"//xla/mlir/runtime/transforms:compiler",
"//xla/mlir/runtime/transforms:jit_compiler",
"//xla/mlir_hlo",
"//xla/mlir_hlo:all_passes",
"//xla/mlir_hlo:mhlo_passes",
"//xla/mlir_hlo:transforms_passes",
"//xla/runtime:custom_call_registry",
"//xla/runtime:executable",
"//xla/runtime:jit_executable",
"//xla/service:algebraic_simplifier",
"//xla/service:all_reduce_promotion",
"//xla/service:all_to_all_decomposer",
@@ -564,9 +555,6 @@ cc_library(
"//xla:util",
"//xla:xla_data_proto_cc",
"//xla/hlo/ir:hlo",
"//xla/mlir/runtime/transforms:compiler",
"//xla/runtime:executable",
"//xla/runtime:jit_executable",
"//xla/service:buffer_assignment",
"//xla/service:computation_layout",
"//xla/service:custom_call_status_internal",
@@ -808,7 +796,6 @@ cc_library(
"//xla:statusor",
"//xla:types",
"//xla:util",
"//xla/runtime:execution_engine",
"//xla/service:llvm_compiler",
"//xla/service/llvm_ir:llvm_util",
"@com_google_absl//absl/functional:any_invocable",
15 changes: 0 additions & 15 deletions third_party/xla/xla/service/cpu/compiler_functor.cc
@@ -35,7 +35,6 @@ limitations under the License.
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
#include "xla/runtime/execution_engine.h"
#include "xla/service/cpu/cpu_runtime.h"
#include "xla/service/cpu/llvm_ir_runtime.h"
#include "xla/service/llvm_ir/llvm_util.h"
@@ -160,20 +159,6 @@ llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> CompilerFunctor::operator()(

llvm::ModulePassManager pm;

for (const auto& func_name : convert_to_xla_runtime_abi_) {
llvm::Function* func = module.getFunction(func_name);
// Create a new function with the XLA Runtime ABI and inline the original
// (i.e. with ctx + memref args) into it.
std::string inlined_func_name =
absl::StrCat(func_name, "__orig_xla_runtime_abi");
func->setName(inlined_func_name);
absl::Status status = xla::runtime::ExportWithXlaRuntimeAbi(
module, inlined_func_name, func_name);
if (!status.ok()) {
LOG(FATAL) << status.message();
}
}

if (dfsan_enabled_) {
pm.addPass(llvm::DataFlowSanitizerPass(dfsan_abi_list_files_));
}
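
The surviving path here only adds DataFlowSanitizer instrumentation when requested. As background, a minimal standalone sketch of how such an llvm::ModulePassManager is typically driven with LLVM's new pass manager is shown below; the analysis-manager boilerplate and the RunDfsanOnly name are illustrative assumptions, not code from this commit.

#include <string>
#include <vector>

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"

// Instruments `module` with DFSan using the given ABI list files.
void RunDfsanOnly(llvm::Module& module,
                  const std::vector<std::string>& abi_list_files) {
  llvm::LoopAnalysisManager lam;
  llvm::FunctionAnalysisManager fam;
  llvm::CGSCCAnalysisManager cgam;
  llvm::ModuleAnalysisManager mam;

  llvm::PassBuilder pb;
  pb.registerModuleAnalyses(mam);
  pb.registerCGSCCAnalyses(cgam);
  pb.registerFunctionAnalyses(fam);
  pb.registerLoopAnalyses(lam);
  pb.crossRegisterProxies(lam, fam, cgam, mam);

  llvm::ModulePassManager pm;
  pm.addPass(llvm::DataFlowSanitizerPass(abi_list_files));
  pm.run(module, mam);  // runs the instrumentation in place
}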
7 changes: 2 additions & 5 deletions third_party/xla/xla/service/cpu/compiler_functor.h
@@ -46,8 +46,7 @@ class CompilerFunctor : public llvm::orc::IRCompileLayer::IRCompiler {
absl::AnyInvocable<void(const llvm::object::ObjectFile&)>
post_codegen_hook = nullptr,
bool dfsan_enabled = false,
const std::vector<std::string>& dfsan_abi_list_files = {},
const std::vector<std::string>& convert_to_xla_runtime_abi = {})
const std::vector<std::string>& dfsan_abi_list_files = {})
: IRCompiler(llvm::orc::IRSymbolMapper::ManglingOptions()),
target_machine_(target_machine),
opt_level_(opt_level),
@@ -59,8 +58,7 @@ class CompilerFunctor : public llvm::orc::IRCompileLayer::IRCompiler {
post_optimization_hook_(std::move(post_optimization_hook)),
post_codegen_hook_(std::move(post_codegen_hook)),
dfsan_enabled_(dfsan_enabled),
dfsan_abi_list_files_(dfsan_abi_list_files),
convert_to_xla_runtime_abi_(convert_to_xla_runtime_abi) {}
dfsan_abi_list_files_(dfsan_abi_list_files) {}

// Compile a Module to an ObjectFile.
llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> operator()(
@@ -78,7 +76,6 @@ class CompilerFunctor : public llvm::orc::IRCompileLayer::IRCompiler {
absl::AnyInvocable<void(const llvm::object::ObjectFile&)> post_codegen_hook_;
const bool dfsan_enabled_ = false;
const std::vector<std::string> dfsan_abi_list_files_;
const std::vector<std::string> convert_to_xla_runtime_abi_;
};

} // namespace cpu
8 changes: 1 addition & 7 deletions third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -1412,11 +1412,6 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
obj_file.getData().size()));
};

std::vector<std::string> xla_runtime_abi_conversions;
if (options.use_mlir_hlo_lowering()) {
xla_runtime_abi_conversions.push_back(options.entry_point_name());
}

CompilerFunctor compiler_functor(
target_machine.get(), static_cast<int>(opt_level),
options::OptimizeForSizeRequested(module->config()),
@@ -1425,8 +1420,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
llvm_ir::GetCpuFastMathFlags(module->config()),
pre_optimization_ir_hook, post_optimization_ir_hook,
post_codegen_hook, aot_options.sanitize_dataflow(),
aot_options.sanitize_abilists_dataflow(),
xla_runtime_abi_conversions);
aot_options.sanitize_abilists_dataflow());
std::unique_ptr<llvm::MemoryBuffer> object_file =
cantFail(compiler_functor(*llvm_module));
ObjectFileData object_file_data(object_file->getBufferStart(),
206 changes: 12 additions & 194 deletions third_party/xla/xla/service/cpu/cpu_executable.cc
@@ -35,7 +35,6 @@ limitations under the License.
#include "mlir/Parser/Parser.h" // from @llvm-project
#include "xla/hlo/ir/hlo_computation.h"
#include "xla/hlo/ir/hlo_module.h"
#include "xla/mlir/runtime/transforms/compiler.h"
#include "xla/service/buffer_assignment.h"
#include "xla/service/computation_layout.h"
#include "xla/service/logical_buffer.h"
Expand All @@ -56,8 +55,6 @@ limitations under the License.
namespace xla {
namespace cpu {

namespace runtime = ::xla::runtime;

absl::StatusOr<std::unique_ptr<CpuExecutable>> CpuExecutable::Create(
std::unique_ptr<SimpleOrcJIT> jit,
std::unique_ptr<const BufferAssignment> assignment,
@@ -95,15 +92,11 @@ absl::StatusOr<std::unique_ptr<CpuExecutable>> CpuExecutable::Create(
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
std::unique_ptr<const BufferAssignment> assignment,
std::unique_ptr<XlaRuntimeCpuExecutable> xla_runtime_executable) {
std::unique_ptr<const BufferAssignment> assignment) {
std::unique_ptr<CpuExecutable> executable(new CpuExecutable(
std::move(hlo_module), std::move(hlo_profile_printer_data),
std::move(hlo_profile_index_map), std::move(assignment)));
executable->set_ir_module_string(
xla_runtime_executable->GetExecutable().take_ir_module_string());
executable->module_name_ = "main";
executable->xla_runtime_executable_ = std::move(xla_runtime_executable);
return executable;
}

@@ -237,33 +230,17 @@ Status CpuExecutable::ExecuteComputeFunction(
}
};

if (IsXlaRuntime()) {
std::vector<BufferDesc> descriptor_table;
descriptor_table.reserve(buffers.size());
for (const auto& buffer : buffers) {
const tensorflow::se::DeviceMemoryBase& base =
buffer.AsDeviceMemoryBase();
BufferDesc desc(const_cast<void*>(base.opaque()), base.size());
descriptor_table.push_back(std::move(desc));
}
Status status = ExecuteXlaRuntime(descriptor_table, run_options);
record_profile();
if (!status.ok()) {
return status;
}
} else {
XlaCustomCallStatus status;
// For the entry computation (like all global computations), all inputs and
// outputs are in the buffer table, and both the result pointer and args
// array pointers are unused (so we set them to 'nullptr').
compute_function_(nullptr, run_options, nullptr, buffer_pointers.data(),
&status, profile_counters);
record_profile();
std::optional<absl::string_view> error_message =
CustomCallStatusGetMessage(&status);
if (error_message) {
return Internal("CustomCall failed: %s", *error_message);
}
XlaCustomCallStatus status;
// For the entry computation (like all global computations), all inputs and
// outputs are in the buffer table, and both the result pointer and args
// array pointers are unused (so we set them to 'nullptr').
compute_function_(nullptr, run_options, nullptr, buffer_pointers.data(),
&status, profile_counters);
record_profile();
std::optional<absl::string_view> error_message =
CustomCallStatusGetMessage(&status);
if (error_message) {
return Internal("CustomCall failed: %s", *error_message);
}

return OkStatus();
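
For context on the call above: the JIT-compiled entry point is reached through a raw function pointer taking the same six arguments passed here. The alias below is a hedged reconstruction of that convention (the real ComputeFunctionType lives in cpu_executable.h, which is not part of this diff), not an authoritative definition.

#include <cstdint>
#include "xla/executable_run_options.h"      // provides xla::ExecutableRunOptions
#include "xla/service/custom_call_status.h"  // provides XlaCustomCallStatus

// Assumed shape of the JIT-compiled entry point. For the entry computation the
// result pointer and args array are unused, so the call site above passes
// nullptr for both and reaches every buffer through buffer_table instead.
using ComputeFunctionType = void (*)(
    void* result, const xla::ExecutableRunOptions* run_options,
    const void** args, void** buffer_table, XlaCustomCallStatus* status,
    int64_t* profile_counters);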
@@ -369,162 +346,6 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
return std::move(result);
}

// Converts a BufferDesc to a MemrefDesc according to the given 'operand_type',
// which should point to a runtime::MemrefType.
// Note: 'descriptor_index' and 'operand_index' are just used for error
// reporting.
static absl::StatusOr<runtime::MemrefDesc> BufferToMemref(
const BufferDesc& descriptor, const runtime::Type& operand_type,
size_t descriptor_index, size_t operand_index) {
auto* memref = llvm::dyn_cast<runtime::MemrefType>(&operand_type);
if (!memref) {
return Internal(
"Cannot convert descriptor %zu (operand_index %zu): "
"the corresponding type in the signature is a %s, "
"not a MemrefType.",
descriptor_index, operand_index, operand_type.ToString());
}

absl::Span<const int64_t> dims = memref->sizes();

// Verify that the provided descriptor size matches that of the memref.
size_t n_elem = absl::c_accumulate(dims, size_t{1}, std::multiplies<>());
size_t expected_size =
primitive_util::ByteWidth(memref->element_type()) * n_elem;
if (LLVM_UNLIKELY(expected_size != descriptor.size())) {
return InvalidArgument(
"Cannot convert descriptor %zu (operand_index %zu): "
"buffer size is not equal to that expected from the element type: "
"got %zu vs expected %zu.",
descriptor_index, operand_index, descriptor.size(), expected_size);
}

auto fill_sizes_and_strides = [&](auto sizes, auto strides) {
size_t multiplier = 1;
for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
size_t size = dims[i];
sizes[i] = size;
strides[i] = multiplier;
multiplier *= size;
}
};
return runtime::MemrefDesc(memref->rank(), memref->element_type(),
descriptor.data(), /*offset=*/0,
fill_sizes_and_strides);
}
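
The fill_sizes_and_strides lambda above encodes a dense row-major layout: the innermost dimension gets stride 1 and every outer stride is the product of the sizes inside it. A self-contained sketch of the same arithmetic with an illustrative shape (the numbers are not taken from the commit):

#include <cstdio>
#include <vector>

int main() {
  // For dims {2, 3, 4}: strides become {12, 4, 1} and, with 4-byte f32
  // elements, the expected buffer size is 2 * 3 * 4 * 4 = 96 bytes.
  std::vector<long> dims = {2, 3, 4};
  std::vector<long> strides(dims.size());
  long multiplier = 1;
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    strides[i] = multiplier;
    multiplier *= dims[i];
  }
  std::printf("strides = {%ld, %ld, %ld}, elements = %ld\n",
              strides[0], strides[1], strides[2], multiplier);
  return 0;
}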

// Executes from an XLA Runtime CPU executable, given a buffer descriptor table.
// Relevant elements of the descriptor table (i.e. arguments and results) are
// converted to MemrefDesc's according to the corresponding operands in the
// runtime signature.
Status XlaRuntimeCpuExecutable::Execute(
const std::vector<BufferDesc>& descriptor_table,
const ExecutableRunOptions* run_options) {
const runtime::FunctionType& signature = GetExecutable().runtime_signature();

size_t num_arguments = xla_framework_mapping_.inputs.size();
if (xla_framework_mapping_.output_is_tuple) {
num_arguments += xla_framework_mapping_.flattened_outputs.size();
} else if (xla_framework_mapping_.result != -1) {
num_arguments += 1;
}

// Verify that the number of arguments in the mapping matches the signature.
// Add one to num_arguments to account for the signature's execution context.
if (num_arguments + 1 != signature.num_operands()) {
return Internal(
"Wrong number of arguments: got %zu via XLA FrameworkMapping, expected "
"%d.",
num_arguments, static_cast<int>(signature.num_operands()) - 1);
}

std::vector<runtime::MemrefDesc> arguments;
arguments.reserve(num_arguments);

auto append_converted_buffer = [&](size_t descriptor_index) -> Status {
const BufferDesc& descriptor = descriptor_table[descriptor_index];

// Use 1-based index to account for the execution context.
size_t operand_index = arguments.size() + 1;
const runtime::Type* operand_type = signature.operand(operand_index);

absl::StatusOr<runtime::MemrefDesc> memref = BufferToMemref(
descriptor, *operand_type, descriptor_index, operand_index);
if (!memref.ok()) {
return memref.status();
}
arguments.push_back(std::move(*memref));
return OkStatus();
};

// Inputs come first; results come last.
for (int64_t index : xla_framework_mapping_.inputs) {
TF_RETURN_IF_ERROR(append_converted_buffer(index));
}

int64_t result_index = xla_framework_mapping_.result;
if (xla_framework_mapping_.output_is_tuple) {
size_t num_outputs = xla_framework_mapping_.flattened_outputs.size();
for (size_t i = 0; i < num_outputs; ++i) {
int64_t output_index = xla_framework_mapping_.flattened_outputs[i];

TF_RETURN_IF_ERROR(append_converted_buffer(output_index));

// Populate the output tuple with a pointer to this result.
// TODO(b/249078472): make this work with nested tuples, if needed.
assert(result_index != -1);
void** results =
static_cast<void**>(descriptor_table[result_index].data());
results[i] = descriptor_table[output_index].data();
}
} else if (result_index != -1) {
TF_RETURN_IF_ERROR(append_converted_buffer(result_index));
}

runtime::Executable::CallFrame call_frame;
// Skip verification. The MemrefDesc's we created above come from the runtime
// signature; verifying them against the same signature would be redundant.
if (auto status =
GetExecutable().InitializeCallFrame(arguments, &call_frame,
/*verify_arguments=*/false);
!status.ok()) {
return Internal("Failed to initialize call frame: %s.",
status.message());
}

// No results to return; they are returned via out params.
runtime::NoResultConverter converter;

// Collect all emitted diagnostic messages.
std::string diagnostic;
runtime::DiagnosticEngine diagnostic_engine;
diagnostic_engine.AddHandler([&](runtime::Diagnostic& d) {
absl::StrAppend(&diagnostic, d.status().message());
return runtime::success();
});

runtime::CustomCall::UserData user_data(run_options);

runtime::Executable::ExecuteOpts opts;
opts.custom_call_data = &user_data;
opts.diagnostic_engine = &diagnostic_engine;
opts.custom_call_registry = &dynamic_custom_calls_;

// We don't expect to see any async tasks in the XLA Runtime executable.
opts.async_task_runner =
reinterpret_cast<runtime::AsyncTaskRunner*>(0xdeadbeef);

// Execute with the prepared call frame.
GetExecutable().Execute(call_frame, opts);
if (auto status = GetExecutable().ReturnResults(converter, &call_frame);
!status.ok()) {
return Internal("Failed to execute XLA Runtime executable: %s%s%s.",
status.message(), diagnostic.empty() ? "" : ": ",
diagnostic);
}
return OkStatus();
}
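
To make the argument bookkeeping in the removed Execute() concrete: inputs are appended first, flattened tuple outputs follow, and the runtime signature carries one extra leading operand for the execution context, which is why operand lookups are 1-based. A tiny sketch with made-up sizes (not from the commit):

#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical mapping: 2 inputs and a tuple of 3 flattened outputs.
  std::size_t num_inputs = 2;
  std::size_t num_flattened_outputs = 3;
  std::size_t num_arguments = num_inputs + num_flattened_outputs;  // 5 MemrefDescs
  // Operand 0 of the runtime signature is the execution context, so the
  // signature must have num_arguments + 1 operands in total.
  std::size_t expected_operands = num_arguments + 1;               // 6
  std::printf("arguments = %zu, signature operands = %zu\n", num_arguments,
              expected_operands);
  return 0;
}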

absl::StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
const ServiceExecutableRunOptions* run_options,
std::vector<ExecutionInput> arguments,
@@ -612,9 +433,6 @@ const InstructionValueSet& CpuExecutable::GetRootValueSet() const {
}

int64_t CpuExecutable::SizeOfGeneratedCodeInBytes() const {
// TODO(b/233850967): support profiling in XLA:CPU-Next, instead of
// punting on it as we are doing here.
if (IsXlaRuntime()) return 0;
return jit_->SizeOfGeneratedCodeInBytes();
}
