Performance drop #6177
Comments
@m-diers Marco, could you please be more specific about how to reproduce this? Do you have a small piece of code reproducing it?
The performance degradation is noticeable in simple algorithms, such as the ones below.
Note that there was a small improvement in some cases (i.e. for CPU-heavy tasks and smaller input sizes), if that's of any interest.
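For context, a minimal sketch of the kind of benchmark used for these measurements (my reconstruction, not the exact code; it assumes hpx::for_loop and hpx::chrono::high_resolution_timer):
#include <hpx/chrono.hpp>
#include <hpx/hpx_main.hpp>
#include <hpx/parallel/algorithms/for_loop.hpp>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>
int main() {
    // Cheap per-element work over a large range: the regime where the
    // regression is most visible (CPU-heavy tasks on smaller inputs
    // actually improved slightly).
    std::vector<double> data( 100000000, 1.0 );
    hpx::chrono::high_resolution_timer timer;
    hpx::for_loop( hpx::execution::par, 0ul, data.size(),
                   [&]( std::size_t i ) { data[i] = std::sin( data[i] ) + std::cos( data[i] ); } );
    std::cout << "elapsed: " << timer.elapsed() << " s\n";
    return 0;
}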
@Pansysk75 thanks. Doesn't that show the opposite trend from the measurements we did before? This PR binds threads to cores in a way that prevents them from being stolen by other cores, which could be the reason for the performance regression. Could you please try commenting out this code section: hpx/libs/core/algorithms/include/hpx/parallel/util/adapt_thread_priority.hpp, lines 31 to 46 (at d05037b)?
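If patching HPX is inconvenient, a rough way to test the same hypothesis from user code would be to force tasks back to a stealable priority. This is only a sketch, assuming the executor supports the with_priority property:
#include <hpx/execution.hpp>
#include <hpx/hpx_main.hpp>
#include <hpx/parallel/algorithms/for_loop.hpp>
#include <cstddef>
int main() {
    hpx::execution::parallel_executor exec;
    // Request a stealable priority; the referenced section (if I read it
    // correctly) otherwise promotes parallel tasks to a bound priority.
    auto stealable = hpx::execution::experimental::with_priority(
        exec, hpx::threads::thread_priority::normal );
    hpx::for_loop( hpx::execution::par.on( stealable ), 0ul, 1000000ul,
                   []( std::size_t ) { /* per-iteration workload */ } );
    return 0;
}
If the regression disappears with this in place, that would support the work-stealing explanation.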
Here is the old example with an adapted workload.
Example:
#include "hpx/runtime_distributed/find_all_localities.hpp"
#include <hpx/hpx_init.hpp>
#include <hpx/iostream.hpp>
#include <hpx/future.hpp>
#include <hpx/compute_local/host/numa_domains.hpp>
#include <hpx/include/components.hpp>
#include <hpx/include/parallel_executors.hpp>
#include <hpx/parallel/algorithms/for_loop.hpp>
#include <hpx/serialization.hpp>
#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>
namespace hpx::parallel::execution {
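// Thin wrapper that remembers its (first PU, PU count) pair so that the
// chunk-size parameter below can query how many PUs this executor owns.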
struct restricted_thread_pool_executor_v : public restricted_thread_pool_executor {
public:
explicit restricted_thread_pool_executor_v( std::pair<std::size_t, std::size_t> num_pus )
: restricted_thread_pool_executor( num_pus.first,
num_pus.second )
, num_pus_( num_pus ) {
}
auto get_num_pus() const -> std::pair<std::size_t, std::size_t> {
return num_pus_;
}
private:
std::pair<std::size_t, std::size_t> num_pus_;
};
template<>
struct is_one_way_executor<restricted_thread_pool_executor_v> : std::true_type {
};
template<>
struct is_two_way_executor<restricted_thread_pool_executor_v> : std::true_type {
};
template<>
struct is_bulk_two_way_executor<restricted_thread_pool_executor_v> : std::true_type {
};
struct executor_guided_chunk_size {
constexpr explicit executor_guided_chunk_size() = default;
template<typename Executor, typename F>
constexpr std::size_t get_chunk_size( Executor&& exec,
F&& f,
std::size_t cores,
std::size_t num_tasks ) const {
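// Ceiling division: split the iterations into (at most) one chunk per PU
// owned by this executor, never returning a chunk size below 1.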
auto corecount( exec.get_num_pus().second );
//auto corecount( 4ul ); // for me 4 or 6, fix it
return ( std::max )( 1ul, ( num_tasks + corecount - 1 ) / corecount );
}
private:
friend class hpx::serialization::access;
template<typename Archive>
void serialize( Archive& /*ar*/,
const unsigned int /*version*/ ) {
}
};
template<>
struct is_executor_parameters<executor_guided_chunk_size>
: std::true_type {
};
}
using Target = std::array<std::size_t, 3>;
using Targets = std::vector<Target>;
class Component6177 : public hpx::components::component_base<Component6177> {
private:
using Grid = std::vector<std::vector<float>>;
static constexpr auto TimeSteps = 5000ul;
static constexpr auto DimX = 540ul;
static constexpr auto DimZ = 230ul;
static constexpr auto DimPad = 6ul;
public:
Component6177() = default;
auto execute( Target target ) -> Target {
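// Runs the stencil for a fixed number of time steps on the PU range
// encoded in 'target' ({locality id, first PU, PU count}).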
hpx::parallel::execution::restricted_thread_pool_executor_v executor{ std::make_pair( target[1], target[2] ) };
Grid grida( DimX, Grid::value_type( DimZ, 1.f ) );
Grid padded( DimX + 2 * DimPad, Grid::value_type( DimZ + 2 * DimPad, 99.f ) );
for( auto t( 0ul ); t < TimeSteps; ++t ) {
hpx::for_loop( hpx::execution::par.on( executor ).with( hpx::parallel::execution::executor_guided_chunk_size{} ),
DimPad, DimX + DimPad,
[&]( auto i ) {
auto sign = i % 2;
for( auto k = DimPad; k < DimZ + DimPad; ++k ) {
grida[i - DimPad][k - DimPad] = 0.5 * padded[i][k] +
0.4 * ( padded[i + 1][k] + sign * padded[i - 1][k] ) +
0.3 * ( padded[i + 2][k] + sign * padded[i - 2][k] ) +
0.2 * ( padded[i + 3][k] + sign * padded[i - 3][k] ) +
0.1 * ( padded[i + 4][k] + sign * padded[i - 4][k] ) +
0.09 * ( padded[i + 5][k] + sign * padded[i - 5][k] ) +
0.08 * ( padded[i + 6][k] + sign * padded[i - 6][k] ) +
0.5 * padded[i][k] +
0.4 * ( padded[i][k + 1] + sign * padded[i][k - 1] ) +
0.3 * ( padded[i][k + 2] + sign * padded[i][k - 2] ) +
0.2 * ( padded[i][k + 3] + sign * padded[i][k - 3] ) +
0.1 * ( padded[i][k + 4] + sign * padded[i][k - 4] ) +
0.09 * ( padded[i][k + 5] + sign * padded[i][k - 5] ) +
0.08 * ( padded[i][k + 6] + sign * padded[i][k - 6] );
}
} );
}
return target;
}
auto targets( std::size_t hint ) const -> Targets {
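// Splits each NUMA domain into contiguous PU ranges of at least 'hint' PUs
// that evenly divide the domain, one {locality, first PU, count} per range.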
auto numadomains( hpx::compute::host::numa_domains() );
auto numacount( numadomains.size() );
auto numasize( numadomains.front().num_pus().second );
auto executorsize( std::min( hint, numasize ) );
while( numasize % executorsize ) {
++executorsize;
}
Targets targets;
for( auto i( 0u ); i < ( numacount * numasize ); i += executorsize ) {
targets.emplace_back( Target{ hpx::get_locality_id(), i, executorsize } );
}
return targets;
}
HPX_DEFINE_COMPONENT_ACTION( Component6177, execute );
HPX_DEFINE_COMPONENT_ACTION( Component6177, targets );
};
HPX_REGISTER_COMPONENT( hpx::components::component<Component6177>, Component6177 );
HPX_REGISTER_ACTION( Component6177::execute_action );
HPX_REGISTER_ACTION( Component6177::targets_action );
class Component6177Client : public hpx::components::client_base<Component6177Client, Component6177> {
using BaseType = hpx::components::client_base<Component6177Client, Component6177>;
public:
template<typename... Arguments>
explicit Component6177Client( Arguments... arguments )
: BaseType( std::move( arguments )... ) {
}
template<typename... Arguments>
auto execute( Arguments... arguments ) -> hpx::future<Target> {
return hpx::async<Component6177::execute_action>( this->get_id(), std::move( arguments )... );
}
template<typename... Arguments>
auto targets( Arguments... arguments ) -> hpx::future<Targets> {
return hpx::async<Component6177::targets_action>( this->get_id(), std::move( arguments )... );
}
};
int hpx_main() {
std::vector<Component6177Client> clients;
auto localities( hpx::find_all_localities() );
std::transform( std::begin( localities ), std::end( localities ),
std::back_inserter( clients ),
[]( auto& loc ) {
return hpx::new_<Component6177Client>( loc );
} );
std::vector<hpx::future<Targets>> targets;
for( auto& client : clients ) {
targets.emplace_back( client.targets( 4ul ) );
}
std::vector<hpx::future<Target>> results;
std::for_each( std::begin( targets ), std::end( targets ),
[&]( auto&& target ) {
for( auto&& subtarget : target.get() ) {
results.emplace_back( clients[subtarget[0]].execute( subtarget ) );
}
} );
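// Rotate the work until 25 'shots' have completed: wait for any result,
// then immediately resubmit the finished target to its locality.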
for( auto counter( results.size() ); counter < 25ul; ++counter ) {
hpx::wait_any( results );
auto res( std::find_if( std::begin( results ), std::end( results ), []( auto& result ) { return result.has_value(); } ) );
auto result( res->get() );
*res = clients[result[0]].execute( result );
std::cout << "Shot " << counter << " on " << result[0] << " " << result[1] << ":" << result[2] << std::endl;
}
hpx::wait_all( results );
return hpx::finalize();
}
int main( int argc, char* argv[] ) {
return hpx::init( argc, argv );
}
Unfortunately, I also get a compilation error with the commit I mentioned. I have entered my fixed value there for testing. The executor and the chunk size still have to be passed through. Runtime:
@hkaiser Application runtime:
Merged pull requests referencing this issue:
6158: Update documentation in `writing single-node applications` page (r=hkaiser, a=dimitraka)
- Rename LCOs to Synchronization Objects
- List all synchronization objects and add documentation for each: barrier, condition variable, latch, mutex, shared mutex, semaphore, guards
- List all execution control objects and add documentation for each: future, channel, task block, task group, thread
- Update sections: Extended facilities for futures, Using parallel algorithms, Parallel algorithms
6181: LCI parcelport: backlog queue, aggregation, separate devices, and more (r=hkaiser, a=JiakunYan)
More optimizations for the LCI parcelport:
- Add a TLS message backlog queue for unsuccessful sends and eager message aggregation. Use the option hpx.parcel.lci.backlog_queue to control it (default is 0).
- Let users control progress-thread pinning via the option hpx.parcel.lci.prg_thread_core; only valid if hpx.parcel.lci.rp_prg_pool=0.
- Use LCI_mbuffer_alloc for eager messages.
- Use separate devices/progress threads for eager/iovec messages. Use the option hpx.parcel.lci.use_two_device to control this (default is 0).
- Change the default value of hpx.parcel.lci.sendimm to 1, i.e. bypass parcel queues and connection caches by default.
6182: Fixing performance regressions (r=hkaiser, a=hkaiser)
Working towards fixing #6177
Co-authored-by: kadimitra <kadimitra@ece.auth.gr>
Co-authored-by: dimitraka <kadimitra@ece.auth.gr>
Co-authored-by: Jiakun Yan <jiakunyan1998@gmail.com>
Co-authored-by: Hartmut Kaiser <hartmut.kaiser@gmail.com>
Expected Behavior
No performance drop
Actual Behavior
Performance drop of 25% with the changes of commit 6f16e77
Steps to Reproduce the Problem
Use of restricted_thread_pool_executor and executor_guided_chunk_size as in my old examples in #5117.
Specifications