rosa/inc/alsk/executor/impl/firstlevel/noopti.h

130 lines
3.9 KiB
C++

#ifndef ALSK_ALSK_EXECUTOR_IMPL_FIRSTLEVEL_NOOPTI_H
#define ALSK_ALSK_EXECUTOR_IMPL_FIRSTLEVEL_NOOPTI_H
#include <thread>
#include <set>
#include <cmath>
#include <vector>
#include "../../executorbase.h"
#include "../../executorstate.h"
#include "../../../skeleton/traits.h"
namespace alsk {
namespace exec {
/**
 * Parallel executor that spreads only the first parallel level over threads.
 *
 * A level reached with `parDepth == 0` is split over up to `cores` threads
 * (the calling thread included); a level reached with a non-zero `parDepth`
 * runs sequentially on the thread that reached it.
 *
 * `cores` is not declared in this struct; it is presumably provided by
 * ExecutorBase -- confirm there.
 *
 * NOTE(review): the template parameter S is not used by anything in this struct.
 */
template<typename S>
struct FirstLevelNoOpti: ExecutorBase {
	using Tag = alsk::tag::Parallel;

public:
	/** Executor data handed to every task invocation. */
	struct Info {
		/// Incremented each time a parallel level is actually split across threads.
		unsigned int parDepth;
	};

private:
	/**
	 * Maximum number of threads usable at the given depth.
	 * @param level current `parDepth`
	 * @return `cores` for level 0, 1 for any other level
	 */
	unsigned int threadLimit(unsigned int level) const { return level? 1 : cores; }

	/**
	 * Length of chunk `index` when `n` items are cut into `parts` contiguous chunks.
	 *
	 * Every chunk gets `n/parts` items and the first `n%parts` chunks get one
	 * more, so the lengths always sum to exactly `n`.
	 * @param n     total number of items
	 * @param parts number of chunks, must be non-zero
	 * @param index chunk number in [0, parts)
	 */
	static std::size_t chunkLength(std::size_t n, std::size_t parts, std::size_t index) {
		return n/parts + (index < n%parts? 1 : 0);
	}

public:
	/** @return `count`, unchanged. */
	template<typename Impl>
	std::size_t contextIdCount(Impl&, std::size_t count) { return count; }

	/** @return `id`, unchanged. */
	template<typename Impl>
	std::size_t contextId(Impl&, std::size_t id) { return id; }

	/**
	 * Calls `Task::execute` once for every index in [0, n).
	 *
	 * With `nThreads = min(n, threadLimit(parDepth))`:
	 *  - if `nThreads > 1`, the indices are cut into `nThreads` contiguous
	 *    chunks; `nThreads-1` chunks run on freshly spawned threads, the last
	 *    one on the calling thread, and the tasks receive `Info{parDepth+1}`;
	 *  - otherwise all indices run in order on the calling thread with
	 *    `Info{parDepth}`.
	 *
	 * @param impl       implementation object; `impl.executorInfo.parDepth` is read
	 * @param task       forwarded to `Task::execute`
	 * @param parameters forwarded to `Task::execute`
	 * @param n          number of task invocations
	 */
	template<typename Task, typename Impl, typename BTask, typename Parameters>
	void executeParallel(Impl& impl, BTask& task, Parameters const& parameters, std::size_t n) {
		auto const& parDepth = impl.executorInfo.parDepth;
		std::size_t const maxThreads = threadLimit(parDepth);
		std::size_t const nThreads = std::min(n, maxThreads);

		if(nThreads > 1) {
			Info info{parDepth+1};
			std::vector<std::thread> threads(nThreads-1);

			// Runs the k consecutive tasks starting at index b.
			auto run = [&](std::size_t b, std::size_t k) {
				for(std::size_t i = 0; i < k; ++i)
					Task::execute(impl, task, b+i, info, parameters, std::tuple<>{});
			};

			// Floor division plus remainder spreading keeps every chunk inside
			// [0, n). Rounding n/nThreads to the nearest integer instead can
			// hand out more than n indices (e.g. n=12 on 8 threads) and makes
			// the unsigned size of the last chunk wrap around.
			std::size_t start{};
			for(std::size_t i = 0; i < nThreads-1; ++i) {
				std::size_t const length = chunkLength(n, nThreads, i);
				threads[i] = std::thread{run, start, length};
				start += length;
			}

			// The calling thread takes whatever is left.
			// NOTE: if this call throws, the joinable threads are destroyed
			// without being joined, which terminates the program.
			run(start, n-start);

			for(std::thread& thread: threads) thread.join();
		} else {
			Info info{parDepth};
			for(std::size_t i = 0; i < n; ++i)
				Task::execute(impl, task, i, info, parameters, std::tuple<>{});
		}
	}

	/**
	 * Calls `Task::execute` for every index in [0, n) and folds the results
	 * into a single value with `Select::execute`.
	 *
	 * The work is split exactly as in the threaded/sequential cases of
	 * `executeParallel`. Each chunk is folded locally (the first result of the
	 * chunk is the seed, later results are combined as
	 * `Select(current, accumulated)`), then the per-chunk results are folded in
	 * chunk order on the calling thread.
	 *
	 * @tparam Value     result type; default-constructed for empty ranges and
	 *                   for the per-chunk slots
	 * @param impl       implementation object; `impl.executorInfo.parDepth` is read
	 * @param task       forwarded to `Task::execute`
	 * @param select     forwarded to `Select::execute`
	 * @param parameters forwarded to both
	 * @param n          number of task invocations
	 * @return the folded value, or `Value{}` when `n == 0`
	 */
	template<typename Value, typename Task, typename Select, typename Impl, typename BTask, typename BSelect, typename Parameters>
	Value executeParallelAccumulate(Impl& impl, BTask& task, BSelect& select, Parameters const& parameters, std::size_t n) {
		auto const& parDepth = impl.executorInfo.parDepth;
		std::size_t const maxThreads = threadLimit(parDepth); // TODO fix neighbours

		Value best{};

		std::size_t const nThreads = std::min(n, maxThreads);
		if(nThreads > 1) {
			Info info{parDepth+1};
			std::vector<std::thread> threads(nThreads-1);

			// Folds the k consecutive tasks starting at index b into out.
			auto run = [&](Value& out, std::size_t b, std::size_t k) {
				Value local{};
				if(k)
					local = Task::execute(impl, task, b+0, info, parameters, std::tuple<>{});
				for(std::size_t i = 1; i < k; ++i) {
					Value current = Task::execute(impl, task, b+i, info, parameters, std::tuple<>{});
					local = Select::execute(impl, select, b+i, info, parameters, std::tuple<>{}, std::move(current), std::move(local));
				}
				out = std::move(local);
			};

			// One result slot per chunk; each thread writes only its own slot.
			std::vector<Value> bests(nThreads);

			std::size_t start{};
			for(std::size_t i = 0; i < nThreads-1; ++i) {
				std::size_t const length = chunkLength(n, nThreads, i);
				threads[i] = std::thread{run, std::ref(bests[i]), start, length};
				start += length;
			}

			// The calling thread folds the last chunk.
			run(bests[nThreads-1], start, n-start);

			for(std::thread& thread: threads) thread.join();

			// nThreads > 1 here, so bests[0] always exists.
			best = std::move(bests[0]);
			for(std::size_t i = 1; i < nThreads; ++i)
				best = Select::execute(impl, select, i, info, parameters, std::tuple<>{}, std::move(bests[i]), std::move(best));
		} else {
			Info info{parDepth};
			if(n)
				best = Task::execute(impl, task, 0, info, parameters, std::tuple<>{});
			for(std::size_t i = 1; i < n; ++i) {
				Value current = Task::execute(impl, task, i, info, parameters, std::tuple<>{});
				best = Select::execute(impl, select, i, info, parameters, std::tuple<>{}, std::move(current), std::move(best));
			}
		}

		return best;
	}
};
/**
 * ExecutorState specialization for FirstLevelNoOpti.
 * It declares no members.
 */
template<class Skeleton>
struct ExecutorState<FirstLevelNoOpti<Skeleton>> {
};
}
}
#endif