Skip to content

Commit

Permalink
Minor codegen improvements (#225)
Browse files Browse the repository at this point in the history
  • Loading branch information
vosen committed May 5, 2024
1 parent bdc652f commit 27c0e13
Show file tree
Hide file tree
Showing 133 changed files with 1,543 additions and 1,341 deletions.
88 changes: 54 additions & 34 deletions ptx/src/emit.rs
Expand Up @@ -7,7 +7,7 @@ use std::ffi::CStr;
use std::fmt::Display;
use std::io::Write;
use std::ptr::null_mut;
use std::{convert, iter, mem, ptr};
use std::{iter, mem, ptr};
use zluda_llvm::core::*;
use zluda_llvm::prelude::*;
use zluda_llvm::zluda::*;
Expand Down Expand Up @@ -157,7 +157,7 @@ impl NamedIdGenerator {
if let Some(id) = id {
self.register_result(id, func)
} else {
func(b"\0".as_ptr() as _)
func(LLVM_UNNAMED)
}
}

Expand Down Expand Up @@ -505,10 +505,12 @@ fn emit_function_variable(
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let llvm_type = get_llvm_type(ctx, &variable.type_)?;
let addr_space = get_llvm_address_space(&ctx.constants, variable.state_space)?;
let value = ctx.names.register_result(variable.name, |name| unsafe {
LLVMZludaBuildAlloca(builder, llvm_type, addr_space, name)
});
let value = emit_alloca(
ctx,
llvm_type,
get_llvm_address_space(&ctx.constants, variable.state_space)?,
Some(variable.name),
);
match variable.initializer {
None => {}
Some(init) => {
Expand All @@ -531,12 +533,27 @@ fn emit_method<'a, 'input>(
let llvm_method = emit_method_declaration(ctx, &method)?;
emit_linkage_for_method(&method, is_kernel, llvm_method);
emit_tuning(ctx, llvm_method, &method.tuning);
for statement in method.body.iter().flat_map(convert::identity) {
let statements = match method.body {
Some(statements) => statements,
None => return Ok(()),
};
// Initial BB that holds all the variable declarations
let bb_with_variables =
unsafe { LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) };
// Rest of the code
let starting_bb =
unsafe { LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) };
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), starting_bb) };
for statement in statements.iter() {
register_basic_blocks(ctx, llvm_method, statement);
}
for statement in method.body.into_iter().flatten() {
for statement in statements.into_iter() {
emit_statement(ctx, is_kernel, statement)?;
}
// happens if there is a post-ret trailing label
terminate_current_block_if_needed(ctx, None);
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), bb_with_variables) };
unsafe { LLVMBuildBr(ctx.builder.get(), starting_bb) };
Ok(())
}

Expand Down Expand Up @@ -604,7 +621,6 @@ fn emit_statement(
is_kernel: bool,
statement: crate::translate::ExpandedStatement,
) -> Result<(), TranslateError> {
start_synthetic_basic_block_if_needed(ctx, &statement);
Ok(match statement {
crate::translate::Statement::Label(label) => emit_label(ctx, label)?,
crate::translate::Statement::Variable(var) => emit_function_variable(ctx, var)?,
Expand Down Expand Up @@ -749,27 +765,6 @@ fn emit_ret_value(
Ok(())
}

fn start_synthetic_basic_block_if_needed(
ctx: &mut EmitContext,
statement: &crate::translate::ExpandedStatement,
) {
let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) };
if current_block == ptr::null_mut() {
return;
}
let terminator = unsafe { LLVMGetBasicBlockTerminator(current_block) };
if terminator == ptr::null_mut() {
return;
}
if let crate::translate::Statement::Label(..) = statement {
return;
}
let new_block =
unsafe { LLVMCreateBasicBlockInContext(ctx.context.get(), b"\0".as_ptr() as _) };
unsafe { LLVMInsertExistingBasicBlockAfterInsertBlock(ctx.builder.get(), new_block) };
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) };
}

fn emit_ptr_access(
ctx: &mut EmitContext,
ptr_access: &crate::translate::PtrAccess<crate::translate::ExpandedArgParams>,
Expand Down Expand Up @@ -1073,14 +1068,36 @@ fn emit_value_copy(
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let type_ = get_llvm_type(ctx, type_)?;
let temp_value = unsafe { LLVMBuildAlloca(builder, type_, LLVM_UNNAMED) };
let temp_value = emit_alloca(ctx, type_, ctx.constants.private_space, None);
unsafe { LLVMBuildStore(builder, src, temp_value) };
ctx.names.register_result(dst, |dst| unsafe {
LLVMBuildLoad2(builder, type_, temp_value, dst)
});
Ok(())
}

/// Emits an `alloca` of `type_` in address space `addr_space` into the first basic block of
/// the function that owns the builder's current insert block, then moves the builder back to
/// the end of the block it was in before the call.
///
/// When `name` is `Some`, the resulting value is registered under that id through
/// `ctx.names.register_result_option`; with `None` it is passed through the same call without
/// an id.
///
/// Rationale, from "Performance Tips for Frontend Authors"
/// (https://llvm.org/docs/Frontend/PerformanceTips.html):
/// "The SROA (Scalar Replacement Of Aggregates) and Mem2Reg passes only attempt to eliminate alloca
/// instructions that are in the entry basic block. Given SSA is the canonical form expected by much
/// of the optimizer; if allocas can not be eliminated by Mem2Reg or SROA, the optimizer is likely to
/// be less effective than it could be."
///
/// NOTE(review): assumes the builder is already positioned inside a function (non-null insert
/// block) and that the function's first block has not been terminated yet; confirm at call sites.
fn emit_alloca(
    ctx: &mut EmitContext,
    type_: LLVMTypeRef,
    addr_space: u32,
    name: Option<Id>,
) -> LLVMValueRef {
    let builder = ctx.builder.get();
    // Remember where regular code emission was happening so it can resume there afterwards.
    let resume_bb = unsafe { LLVMGetInsertBlock(builder) };
    let entry_bb = unsafe {
        let function = LLVMGetBasicBlockParent(resume_bb);
        LLVMGetFirstBasicBlock(function)
    };
    unsafe { LLVMPositionBuilderAtEnd(builder, entry_bb) };
    let alloca = ctx.names.register_result_option(name, |llvm_name| unsafe {
        LLVMZludaBuildAlloca(builder, type_, addr_space, llvm_name)
    });
    unsafe { LLVMPositionBuilderAtEnd(builder, resume_bb) };
    alloca
}

fn emit_instruction(
ctx: &mut EmitContext,
is_kernel: bool,
Expand Down Expand Up @@ -3494,12 +3511,12 @@ fn emit_store_var(

fn emit_label(ctx: &mut EmitContext, label: Id) -> Result<(), TranslateError> {
let new_block = unsafe { LLVMValueAsBasicBlock(ctx.names.value(label)?) };
terminate_current_block_if_needed(ctx, new_block);
terminate_current_block_if_needed(ctx, Some(new_block));
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) };
Ok(())
}

fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasicBlockRef) {
fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: Option<LLVMBasicBlockRef>) {
let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) };
if current_block == ptr::null_mut() {
return;
Expand All @@ -3508,7 +3525,10 @@ fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasic
if terminator != ptr::null_mut() {
return;
}
unsafe { LLVMBuildBr(ctx.builder.get(), new_block) };
match new_block {
Some(new_block) => unsafe { LLVMBuildBr(ctx.builder.get(), new_block) },
None => unsafe { LLVMBuildUnreachable(ctx.builder.get()) },
};
}

fn emit_method_declaration<'input>(
Expand Down
14 changes: 8 additions & 6 deletions ptx/src/test/spirv_run/abs.ll
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
"37":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"28", align 8
Expand All @@ -19,8 +21,8 @@ define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr
store i32 %"29", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"14" to ptr
%"39" = getelementptr inbounds i8, ptr %"31", i64 4
%"32" = load i32, ptr %"39", align 4
%"38" = getelementptr inbounds i8, ptr %"31", i64 4
%"32" = load i32, ptr %"38", align 4
store i32 %"32", ptr addrspace(5) %"7", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"15" = call i32 @llvm.abs.i32(i32 %"16", i1 false)
Expand All @@ -35,8 +37,8 @@ define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"35" = inttoptr i64 %"21" to ptr
%"41" = getelementptr inbounds i8, ptr %"35", i64 4
store i32 %"22", ptr %"41", align 4
%"40" = getelementptr inbounds i8, ptr %"35", i64 4
store i32 %"22", ptr %"40", align 4
ret void
}

Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/activemask.ll
Expand Up @@ -4,11 +4,13 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__activemask() #0

define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"11", ptr addrspace(4) byref(i64) %"12") #1 {
"15":
%"6" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"6", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i32, align 4, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"6", align 1
%"7" = load i64, ptr addrspace(4) %"12", align 8
store i64 %"7", ptr addrspace(5) %"4", align 8
%"8" = call i32 @__zluda_ptx_impl__activemask()
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add.ll
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add_global.ll
Expand Up @@ -4,13 +4,15 @@ target triple = "amdgcn-amd-amdhsa"
@PI = protected addrspace(1) externally_initialized global float 0x400921FB60000000, align 4

define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 {
"24":
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
%"8" = alloca float, align 4, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"21", align 8
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add_non_coherent.ll
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
Expand Down
26 changes: 14 additions & 12 deletions ptx/src/test/spirv_run/add_param_ptr.ll
Expand Up @@ -2,32 +2,34 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 {
"38":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%1 = alloca i64, align 8, addrspace(5)
%2 = alloca i64, align 8, addrspace(5)
br label %3

3: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"31" = ptrtoint ptr addrspace(4) %"26" to i64
%0 = alloca i64, align 8, addrspace(5)
store i64 %"31", ptr addrspace(5) %0, align 8
%"30" = load i64, ptr addrspace(5) %0, align 8
store i64 %"31", ptr addrspace(5) %1, align 8
%"30" = load i64, ptr addrspace(5) %1, align 8
store i64 %"30", ptr addrspace(5) %"4", align 8
%"33" = ptrtoint ptr addrspace(4) %"27" to i64
%1 = alloca i64, align 8, addrspace(5)
store i64 %"33", ptr addrspace(5) %1, align 8
%"32" = load i64, ptr addrspace(5) %1, align 8
store i64 %"33", ptr addrspace(5) %2, align 8
%"32" = load i64, ptr addrspace(5) %2, align 8
store i64 %"32", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"34" = inttoptr i64 %"12" to ptr addrspace(4)
%"40" = getelementptr inbounds i8, ptr addrspace(4) %"34", i64 0
%"11" = load i64, ptr addrspace(4) %"40", align 8
%"39" = getelementptr inbounds i8, ptr addrspace(4) %"34", i64 0
%"11" = load i64, ptr addrspace(4) %"39", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"35" = inttoptr i64 %"14" to ptr addrspace(4)
%"42" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0
%"13" = load i64, ptr addrspace(4) %"42", align 8
%"41" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0
%"13" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"36" = inttoptr i64 %"16" to ptr
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add_tuning.ll
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
Expand Down

0 comments on commit 27c0e13

Please sign in to comment.