Migrate scalar in-place updates to GPU #1665

athas · 2022-05-03T06:11:25Z

Consider this program:

def main (xs: *[]i64) (ys: []i64) =
  #[unsafe]
  let i = i64.sum xs
  in xs with [i] = ys[i]

(The #[unsafe] attribute is not important; it just makes the IR simpler to look at.) Currently the in-place update is located on the host:

entry("main",
      {xs: *direct, ys: direct},
      {*direct})
  entry_main (d₀_4677 : i64, d₁_4678 : i64, xs_4679 : *[d₀_4677]i64,
              ys_4680 : [d₁_4678]i64)
  : {*[d₀_4677]i64} = {
  let {segred_group_size_4690 : i64} =
    get_size(segred_group_size_4689, group_size)
  let {num_groups_4692 : i64} =
    calc_num_groups(d₀_4677, segred_num_groups_4691, segred_group_size_4690)
  let {defunc_2_reduce_res_4694 : [1i64]i64} =
    segred(thread; #groups=num_groups_4692; groupsize=segred_group_size_4690)
    (dummy_4695 < 1i64, gtid_4696 < d₀_4677) (~phys_tid_4697)
    ({0i64},
     ,
     commutative \ {x_4682 : i64, x_4683 : i64}
       : {i64} ->
       let {defunc_1_op_res_4684 : i64} = add64(x_4682, x_4683)
       in {defunc_1_op_res_4684})
    : {i64} {
      let {x_4685 : i64} =
        xs_4679[gtid_4696]
      return {returns x_4685}
    }
  let {defunc_2_reduce_res_4681 : i64} =
    defunc_2_reduce_res_4694[0i64]
  let {lw_val_slice_4686 : [1i64]i64} =
    ys_4680[defunc_2_reduce_res_4681 :+ 1i64 * 1i64]
  let {main_res_4687 : [d₀_4677]i64} =
    xs_4679 with [defunc_2_reduce_res_4681 :+ 1i64 * 1i64] = lw_val_slice_4686
  in {main_res_4687}
}

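This requires synchronisation: the host has to wait for the reduction to finish and read its scalar result back before it can perform the update. It might be possible to handle this using essentially the same mechanism as in #1658. Currently the GPUBody construct doesn't support the returns we need (scatters), but we can write it as a SegMap: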

entry("main",
      {xs: *direct, ys: direct},
      {*direct})
  entry_main (d₀_4677 : i64, d₁_4678 : i64, xs_4679 : *[d₀_4677]i64,
              ys_4680 : [d₁_4678]i64)
  : {*[d₀_4677]i64} = {
  let {segred_group_size_4690 : i64} =
    get_size(segred_group_size_4689, group_size)
  let {num_groups_4692 : i64} =
    calc_num_groups(d₀_4677, segred_num_groups_4691, segred_group_size_4690)
  let {defunc_2_reduce_res_4694 : [1i64]i64} =
    segred(thread; #groups=num_groups_4692; groupsize=segred_group_size_4690)
    (dummy_4695 < 1i64, gtid_4696 < d₀_4677) (~phys_tid_4697)
    ({0i64},
     ,
     commutative \ {x_4682 : i64, x_4683 : i64}
       : {i64} ->
       let {defunc_1_op_res_4684 : i64} = add64(x_4682, x_4683)
       in {defunc_1_op_res_4684})
    : {i64} {
      let {x_4685 : i64} =
        xs_4679[gtid_4696]
      return {returns x_4685}
    }
  let {main_res_4687 : [d₀_4677]i64} =
    segmap(thread; #groups=1i64; groupsize=1i64)
    (gtid_5000 < 1i64) (~phys_tid_5001) : {i64} {
      let {defunc_2_reduce_res_4681 : i64} =
        defunc_2_reduce_res_4694[0i64]
      let {i_5002 : i64} =
        ys_4680[defunc_2_reduce_res_4681]
      return {xs_4679 : [d₀_4677]
              with ([defunc_2_reduce_res_4681] = i_5002)}
    }
  in {main_res_4687}
}

Two mutually exclusive ways to go here: either change GPUBody to contain a proper KernelBody, or just use one of these single-thread SegMaps instead. Operationally they do exactly the same thing, but the former may be more concise. That was, after all, the motivation for GPUBody in the first place.

The text was updated successfully, but these errors were encountered:

athas added the optimisation label Jul 2, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Migrate scalar in-place updates to GPU #1665

Migrate scalar in-place updates to GPU #1665

athas commented May 3, 2022 •

edited

Migrate scalar in-place updates to GPU #1665

Migrate scalar in-place updates to GPU #1665

Comments

athas commented May 3, 2022 • edited

athas commented May 3, 2022 •

edited