cuda fix and fix waker_id bug
ikbuibui committed Apr 5, 2024
1 parent 897ddf6 commit 169cd22
Showing 10 changed files with 440 additions and 282 deletions.
152 changes: 80 additions & 72 deletions examples/cuda_mandelbrot.cu
@@ -5,31 +5,25 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
+#include <redGrapes/SchedulerDescription.hpp>
+#include <redGrapes/dispatch/cuda/cuda_task_properties.hpp>
+#include <redGrapes/dispatch/cuda/cuda_worker.hpp>
+#include <redGrapes/redGrapes.hpp>
+#include <redGrapes/resource/fieldresource.hpp>
+#include <redGrapes/resource/ioresource.hpp>
+#include <redGrapes/scheduler/cuda_thread_scheduler.hpp>
+#include <redGrapes/scheduler/pool_scheduler.hpp>
+#include <redGrapes/task/property/resource.hpp>
+
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <pngwriter.h>
 
 #include <chrono>
 #include <functional>
 #include <iomanip>
 #include <iostream>
 
-enum SchedulerTag
-{
-    SCHED_CUDA
-};
-
-#define REDGRAPES_TASK_PROPERTIES dispatch::cuda::CudaTaskProperties, scheduler::SchedulingTagProperties<SchedulerTag>
-
-#include "redGrapes/dispatch/cuda/scheduler.hpp"
-#include <redGrapes/redGrapes.hpp>
-#include "redGrapes/resource/fieldresource.hpp"
-#include "redGrapes/resource/ioresource.hpp"
-#include "redGrapes/scheduler/pool_scheduler.hpp"
-#include "redGrapes/scheduler/tag_match.hpp"
-#include "redGrapes/task/property/resource.hpp"
-
-namespace rg = redGrapes;
-
 struct Color
 {
     float r, g, b;
@@ -70,18 +64,27 @@ __global__ void mandelbrot(
     out[index] = Color{cosf(float(i) / 7.0), cosf(2.0 + float(i) / 11.0), cosf(4.0 + float(i) / 13.0)};
 }
 
-int main()
+struct CudaTag
 {
-    auto pool_scheduler = std::make_shared<rg::scheduler::PoolScheduler>(4 /* number of CPU workers */);
+};
 
-    auto cuda_scheduler = std::make_shared<rg::dispatch::cuda::CudaScheduler>(
-        [](rg::Task const& t) { return t.required_scheduler_tags.test(SCHED_CUDA); },
-        4 /* number of cuda streams */
-    );
+using RGTask = redGrapes::Task<redGrapes::dispatch::cuda::CudaTaskProperties>;
+
+int main()
+{
+    spdlog::set_level(spdlog::level::trace);
+    spdlog::set_pattern("[thread %t] %^[%l]%$ %v");
 
-    rg::idle = [cuda_scheduler] { cuda_scheduler->poll(); };
+    auto rg = redGrapes::init<redGrapes::dispatch::cuda::CudaTaskProperties>(
+        redGrapes::SchedulerDescription(
+            std::make_shared<redGrapes::scheduler::CudaThreadScheduler<RGTask>>(),
+            CudaTag{}),
+        redGrapes::SchedulerDescription(
+            std::make_shared<redGrapes::scheduler::PoolScheduler<redGrapes::dispatch::thread::DefaultWorker<RGTask>>>(
+                4),
+            redGrapes::DefaultTag{}));
 
-    rg::init(rg::scheduler::make_tag_match_scheduler().add({}, pool_scheduler).add({SCHED_CUDA}, cuda_scheduler));
+    auto& cudaSched = rg.getScheduler<CudaTag>();
 
     double mid_x = 0.41820187155955555;
     double mid_y = 0.32743154895555555;
@@ -90,10 +93,10 @@ int main()
     size_t height = 4096;
     size_t area = width * height;
 
-    rg::IOResource<Color*> host_buffer;
-    rg::IOResource<Color*> device_buffer;
+    redGrapes::IOResource<Color*, RGTask> host_buffer;
+    redGrapes::IOResource<Color*, RGTask> device_buffer;
 
-    rg::emplace_task(
+    rg.emplace_task(
         [area](auto host_buffer)
         {
            void* ptr;
@@ -102,7 +105,7 @@ int main()
         },
         host_buffer.write());
 
-    rg::emplace_task(
+    rg.emplace_task(
         [area](auto device_buffer)
         {
            void* ptr;
@@ -124,52 +127,55 @@ int main()
         /*
          * calculate picture
          */
-        rg::emplace_task(
-            [width, height, area, i, mid_x, mid_y, w](auto device_buffer)
-            {
-                double begin_x = mid_x - w;
-                double end_x = mid_x + w;
-                double begin_y = mid_y - w;
-                double end_y = mid_y + w;
-
-                dim3 threadsPerBlock(8, 8);
-                dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);
-
-                mandelbrot<<<numBlocks, threadsPerBlock, 0, rg::dispatch::cuda::current_stream>>>(
-                    begin_x,
-                    end_x,
-                    begin_y,
-                    end_y,
-                    width,
-                    height,
-                    *device_buffer);
-                std::cout << "launched kernel to stream " << rg::dispatch::cuda::current_stream << std::endl;
-            },
-            rg::TaskProperties::Builder().scheduling_tags({SCHED_CUDA}),
-            device_buffer.write());
+        rg.emplace_task<CudaTag>(
+            [width, height, area, i, mid_x, mid_y, w, &cudaSched](auto device_buffer)
+            {
+                double begin_x = mid_x - w;
+                double end_x = mid_x + w;
+                double begin_y = mid_y - w;
+                double end_y = mid_y + w;
+
+                dim3 threadsPerBlock(8, 8);
+                dim3 numBlocks(width / threadsPerBlock.x, height / threadsPerBlock.y);
+
+                auto current_stream = cudaSched.getCudaStream(0);
+                mandelbrot<<<numBlocks, threadsPerBlock, 0, cudaSched.getCudaStream(0)>>>(
+                    begin_x,
+                    end_x,
+                    begin_y,
+                    end_y,
+                    width,
+                    height,
+                    *device_buffer);
+                std::cout << "launched kernel to stream " << current_stream << std::endl;
+            },
+            device_buffer.write())
+            .cuda_stream_index(0u);
 
         /*
          * copy data
          */
-        rg::emplace_task(
-            [area](auto host_buffer, auto device_buffer)
-            {
-                cudaMemcpyAsync(
-                    *host_buffer,
-                    *device_buffer,
-                    area * sizeof(Color),
-                    cudaMemcpyDeviceToHost,
-                    rg::dispatch::cuda::current_stream);
-                std::cout << "launched memcpy to stream " << rg::dispatch::cuda::current_stream << std::endl;
-            },
-            rg::TaskProperties::Builder().scheduling_tags({SCHED_CUDA}),
-            host_buffer.write(),
-            device_buffer.read());
+        rg.emplace_task<CudaTag>(
+            [area, &cudaSched](auto host_buffer, auto device_buffer)
+            {
+                auto current_stream = cudaSched.getCudaStream(0);
+                cudaMemcpyAsync(
+                    *host_buffer,
+                    *device_buffer,
+                    area * sizeof(Color),
+                    cudaMemcpyDeviceToHost,
+                    current_stream);
+                std::cout << "launched memcpy to stream " << current_stream << std::endl;
+            },
+            host_buffer.write(),
+            device_buffer.read())
+            .cuda_stream_index(0u);
+        ;
 
         /*
          * write png
          */
-        rg::emplace_task(
+        rg.emplace_task(
             [width, height, i](auto host_buffer)
             {
                 std::stringstream step;
@@ -179,9 +185,9 @@ int main()
                 pngwriter png(width, height, 0, filename.c_str());
                 png.setcompressionlevel(9);
 
-                for(int y = 0; y < height; ++y)
+                for(size_t y = 0; y < height; ++y)
                 {
-                    for(int x = 0; x < width; ++x)
+                    for(size_t x = 0; x < width; ++x)
                     {
                         auto& color = (*host_buffer)[x + y * width];
                         png.plot(x + 1, height - y, color.r, color.g, color.b);
@@ -194,7 +200,7 @@ int main()
             host_buffer.read());
     }
 
-    rg::emplace_task([](auto b) {}, host_buffer.write()).get();
+    rg.emplace_task([]([[maybe_unused]] auto b) {}, host_buffer.write()).get();
 
     auto t2 = std::chrono::high_resolution_clock::now();
     std::cout << "runtime: " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " μs"
@@ -203,7 +209,9 @@ int main()
     /*
      * cleanup
      */
-    rg::emplace_task([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write());
+    rg.emplace_task<CudaTag>([](auto host_buffer) { cudaFreeHost(*host_buffer); }, host_buffer.write())
+        .cuda_stream_index(0u);
 
-    rg::emplace_task([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write());
+    rg.emplace_task<CudaTag>([](auto device_buffer) { cudaFree(*device_buffer); }, device_buffer.write())
+        .cuda_stream_index(0u);
 }
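Taken together, the example's migration distills to the pattern below — a condensed sketch assembled from the hunks above, not extra commit content (includes are as in the example; someBuffer stands in for one of the IOResources):

    // Sketch of the new redGrapes API pattern used in this example.
    struct CudaTag
    {
    };

    using RGTask = redGrapes::Task<redGrapes::dispatch::cuda::CudaTaskProperties>;

    // One scheduler per tag: CUDA tasks go to the CudaThreadScheduler,
    // everything else to a 4-worker CPU pool registered under DefaultTag.
    auto rg = redGrapes::init<redGrapes::dispatch::cuda::CudaTaskProperties>(
        redGrapes::SchedulerDescription(
            std::make_shared<redGrapes::scheduler::CudaThreadScheduler<RGTask>>(), CudaTag{}),
        redGrapes::SchedulerDescription(
            std::make_shared<redGrapes::scheduler::PoolScheduler<redGrapes::dispatch::thread::DefaultWorker<RGTask>>>(4),
            redGrapes::DefaultTag{}));

    auto& cudaSched = rg.getScheduler<CudaTag>();

    // Tagging the task replaces the old scheduling_tags({SCHED_CUDA}) builder call,
    // and the stream index becomes a chained task property.
    rg.emplace_task<CudaTag>(
          [&cudaSched](auto buf) { /* launch work on cudaSched.getCudaStream(0) */ },
          someBuffer.write())
        .cuda_stream_index(0u);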
2 changes: 1 addition & 1 deletion redGrapes/TaskCtx.hpp
@@ -32,7 +32,7 @@ namespace redGrapes
         }
         else
         {
-            event->waker_id = event.task->scheduler_p->getNextWorkerID();
+            event->waker_id = event.task->scheduler_p->getNextWorkerID() + 1;
             while(!event->is_reached())
                 TaskFreeCtx::idle();
         }
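The trailing + 1 is the waker_id half of the commit title. A minimal sketch of the apparent scheme — assuming waker ID 0 is reserved for a non-worker waiter such as the main thread, while getNextWorkerID() hands out 0-based worker indices (the reservation is inferred, not stated in the diff):

    // Inferred ID layout (illustrative only): slot 0 = reserved waiter, workers occupy 1..n.
    auto worker_idx = event.task->scheduler_p->getNextWorkerID(); // 0-based worker index
    event->waker_id = worker_idx + 1;                             // shift past reserved slot 0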
redGrapes/dispatch/cuda/cuda_task_properties.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Michael Sippel
+/* Copyright 2024 Tapish Narwal
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include <optional>
+
 namespace redGrapes
 {
     namespace dispatch
@@ -16,7 +18,7 @@ namespace redGrapes
 
         struct CudaTaskProperties
         {
-            std::optional<cudaEvent_t> cuda_event;
+            std::optional<unsigned> m_cuda_stream_idx;
 
             CudaTaskProperties()
             {
@@ -30,6 +32,12 @@ namespace redGrapes
             Builder(PropertiesBuilder& b) : builder(b)
             {
             }
+
+            PropertiesBuilder& cuda_stream_index(unsigned cuda_stream_idx)
+            {
+                *(builder.task->m_cuda_stream_idx) = cuda_stream_idx;
+                return builder;
+            }
         };
 
         struct Patch
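For reference, how the new builder method pairs with the CUDA scheduler in the mandelbrot example above — a condensed usage sketch in which myKernel, grid, block, and buf are placeholders:

    // Pin a task to CUDA stream 1: launch on that stream and record the same
    // index on the task via the new cuda_stream_index property.
    auto& cudaSched = rg.getScheduler<CudaTag>();
    rg.emplace_task<CudaTag>(
          [&cudaSched](auto buf) { myKernel<<<grid, block, 0, cudaSched.getCudaStream(1)>>>(*buf); },
          buf.write())
        .cuda_stream_index(1u);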
