// Executor that distributes the parallel tasks of a skeleton over the threads
// provided by util::StaticPool (see ExecutorState at the end of this file).
//
// NOTE(review): this text appears to have lost everything that was written
// between angle brackets and started with an identifier: the system header
// names below, every template parameter list (bare `template` keywords) and the
// element types of std::map / std::vector / std::set. The code is documented
// as-is; restore the missing pieces from version control before compiling.
#ifndef ALSK_ALSK_EXECUTOR_IMPL_STATICPOOL_H
#define ALSK_ALSK_EXECUTOR_IMPL_STATICPOOL_H

// NOTE(review): seven header names are missing here. The code below uses at
// least std::min, std::map, std::set, std::vector, std::tuple, std::get,
// std::move, std::distance and std::begin.
#include
#include
#include
#include
#include
#include
#include

#include "../executorbase.h"
#include "../executorstate.h"
#include "../../skeleton/traits.h"

#include "../utility/staticpool.h"

namespace alsk {
namespace exec {

// Parallel executor built on top of ExecutorBase.
// NOTE(review): the template parameter list is missing; the body refers to a
// type named `S` (see buildSplitFor), presumably the skeleton type - confirm.
template
struct StaticPool: ExecutorBase {
	using Tag = alsk::tag::Parallel;

public:
	// Per-task execution information handed down to sub tasks.
	struct Info {
		// Number of cores granted to the task; 0 makes threadLimit() fall back
		// to the executor-wide `cores` value.
		std::size_t cores;
		// Number of the first thread the task may use; executeParallel adds
		// i*coresA to it to pick the thread of each chunk.
		std::size_t offset;

		Info(std::size_t cores = 0, std::size_t offset = 0):
			cores{cores}, offset{offset} {}
	};

private:
	// Computes, for a given number of cores, a map from task identifier to
	// thread number by replaying the chunking arithmetic used by
	// executeParallel / executeParallelAccumulate over the skeleton `s`.
	//
	// Parameters:
	//   s     - skeleton to traverse (through skeletonStep / skeletonTraversal,
	//           both defined outside this file).
	//   cores - number of cores to distribute the tasks on.
	// Returns the populated map (key: task id, value: thread number).
	auto buildSplitFor(S& s, std::size_t cores) {
		// NOTE(review): key/value types are missing; both are used as
		// std::size_t below.
		std::map ms;

		// Handles one parallel level of `n` tasks: records the thread number of
		// the first task of each chunk, then recurses into every task through
		// buildSplitImpl.
		auto populateSplitImpl = [&ms, totalCores=cores](
			auto& buildSplitImpl, auto& s,
			std::size_t maxThreads, std::size_t thOffset,
			std::size_t n, std::size_t idStep, std::size_t id, bool isRemainder
		) {
			std::size_t const nThreads = std::min(n, maxThreads);
			if(nThreads > 0) {
				// Same partition as executeParallel: nThreads chunks of `step`
				// tasks, the first `remainBase` chunks taking one extra task.
				std::size_t const step = n/nThreads;
				std::size_t const remainBase = n - step*nThreads;
				std::size_t remain = remainBase;

				// Cores given to regular tasks / to the extra (remainder) tasks.
				std::size_t const coresA = maxThreads/nThreads;
				std::size_t const coresB = remainBase? maxThreads/remainBase : 1;

				std::size_t start = 0;
				for(std::size_t i = 0; i < nThreads; ++i) {
					std::size_t thNum = thOffset + i*coresA;
					// 1 while extra tasks remain to be handed out, 0 afterwards.
					std::size_t offset = !!remain;
					remain -= offset;

					// Keep the first thread number recorded for this id.
					if(!ms.count(id+start*idStep))
						ms[id+start*idStep] = thNum;

					for(std::size_t j = 0; j < step; ++j)
						buildSplitImpl(s, coresA, thNum, id+(start+j)*idStep, false);
					if(offset)
						buildSplitImpl(s, coresB, thNum, id+(start+step)*idStep, true);

					start += step+offset;
				}

				// The id following this level is tagged with totalCores.
				// NOTE(review): presumably a value no real thread number takes,
				// so that buildSplit sees a thread change after a remainder
				// task - confirm.
				if(isRemainder) ms[id+start*idStep] = totalCores;
			} else {
				// No thread available (n == 0 or maxThreads == 0): nothing is
				// recorded at this level, only the children are visited.
				for(std::size_t i = 0; i < n; ++i)
					buildSplitImpl(s, maxThreads, thOffset, id+i*idStep, false);
			}
		};

		// Recursive visitor: for the skeleton node `s`, asks skeletonTraversal
		// to call populateSplit, which forwards to populateSplitImpl unless
		// skeletonStep(s) is 0.
		auto buildSplitImpl = makeRecursiveLambda(
			[&populateSplitImpl](
				auto buildSplitImpl,
				auto& s, auto maxThreads, auto thOffset,
				auto id, bool isRemainder
			) {
				auto idStep = skeletonStep(s);
				auto populateSplit = [&](auto& s, std::size_t n) {
					if(!idStep) return;
					populateSplitImpl(buildSplitImpl, s, maxThreads, thOffset, n, idStep, id, isRemainder);
				};

				skeletonTraversal(s, populateSplit);
			}
		);

		// Start from the root with all cores, thread offset 0 and id 0.
		buildSplitImpl(s, cores, 0ul, 0ul, false);

		return ms;
	}

	// Rebuilds state.executor.split: the set of task ids at which the thread
	// number changes, for every core count listed in repeatability.coresList.
	// The set always contains 0.
	// NOTE(review): template parameter list missing (the body uses `Impl`).
	template
	void buildSplit(Impl& impl) {
		typename Impl::State& state = impl.state;
		auto& split = state.executor.split;

		split.clear();
		split.insert(0);

		for(auto cores: repeatability.coresList) {
			// Thread number of the previously visited entry; entries are
			// visited in increasing id order (std::map iteration order).
			std::size_t curThread = 0;
			// TODO: C++17 (presumably: replace std::get with structured bindings)
			for(auto p: buildSplitFor(impl.skeleton, cores)) {
				if(std::get<1>(p) != curThread) {
					curThread = std::get<1>(p);
					split.insert(std::get<0>(p));
				}
			}
		}
	}

	// Number of threads a task described by `info` may use: info.cores when it
	// is non-zero, otherwise the executor-wide `cores` (not declared in this
	// file; presumably a member inherited from ExecutorBase).
	std::size_t threadLimit(Info const& info) const {
		auto const& lCores = info.cores;
		return lCores? lCores : cores;
	}

public:
	// Initialises the executor for `impl`: resets impl.executorInfo to
	// {cores, 0}, configures state.executor with `cores`, stores
	// impl.parallelTasksCount() and rebuilds the split set.
	// NOTE(review): template parameter list missing (the body uses `Impl`).
	template
	void config(Impl& impl) {
		typename Impl::State& state = impl.state;

		impl.executorInfo.cores = cores;
		impl.executorInfo.offset = 0;

		state.executor.config(cores);

		state.executor.parTasksCount = impl.parallelTasksCount();;
		buildSplit(impl);
	}

	// Returns the number of distinct context ids, i.e. the size of the split
	// set. The second (unnamed) parameter is ignored.
	// NOTE(review): template parameter list missing (the body uses `Impl`).
	template
	std::size_t contextIdCount(Impl& impl, std::size_t) {
		typename Impl::State& state = impl.state;
		return state.executor.split.size();
	}

	// Maps a task id to its context id: the index, within the ordered split
	// set, of the last boundary that is <= id (upper_bound yields the first
	// boundary > id, hence the -1). buildSplit always inserts 0, so after
	// config() the result cannot underflow.
	// Originally annotated O(log(n)): that holds for upper_bound only;
	// NOTE(review): if split is the std::set declared in ExecutorState below,
	// std::distance over its iterators is linear, making the call O(n).
	// NOTE(review): template parameter list missing (the body uses `Impl`).
	template
	std::size_t contextId(Impl& impl, std::size_t id) {
		typename Impl::State& state = impl.state;
		auto& split = state.executor.split;
		return std::distance(std::begin(split), split.upper_bound(id)) - 1;
	}

	// Executes the `n` sub tasks of `task`, spread over up to
	// threadLimit(impl.executorInfo) threads.
	//
	// The tasks are cut into nThreads = min(n, maxThreads) contiguous chunks of
	// n/nThreads tasks; the first n%nThreads chunks receive one extra task.
	// Chunk i is submitted through state.executor.run on thread number
	// executorInfo.offset + i*coresA, then state.executor.wait(futures) is
	// called. When nThreads is 0 (n == 0 or no thread allowed) the tasks are
	// run by the caller with impl.executorInfo.
	// NOTE(review): template parameter list missing (the body uses `Task`,
	// `Impl`, `BTask` and `Parameters`).
	template
	void executeParallel(Impl& impl, BTask& task, Parameters const& parameters, std::size_t n) {
		std::size_t const maxThreads = threadLimit(impl.executorInfo);
		std::size_t const nThreads = std::min(n, maxThreads);
		if(nThreads > 0) {
			// NOTE(review): element type missing; holds what
			// state.executor.run returns.
			std::vector> futures(nThreads);
			std::size_t const step = n/nThreads;
			std::size_t const remainBase = n - step*nThreads;
			std::size_t remain = remainBase;

			std::size_t const coresA = maxThreads/nThreads; // cores for sub tasks in main cases
			std::size_t const coresB = remainBase? maxThreads/remainBase : 1; // cores for remaining tasks

			typename Impl::State& state = impl.state;

			// Runs tasks [b, b+k) with infoA, then task b+k with infoB when
			// `offset` is set (after the loop i == k).
			auto run = [&](std::size_t b, std::size_t k, bool offset, std::size_t thOffset) {
				Info infoA{coresA, thOffset}, infoB{coresB, thOffset};

				std::size_t i;
				for(i = 0; i < k; ++i)
					Task::execute(impl, task, b+i, infoA, parameters, std::tuple<>{});
				if(offset)
					Task::execute(impl, task, b+i, infoB, parameters, std::tuple<>{});
			};

			for(std::size_t i = 0, start = 0; i < nThreads; ++i) {
				std::size_t thNum = impl.executorInfo.offset + i*coresA;
				// 1 while extra tasks remain to be handed out, 0 afterwards.
				std::size_t offset = !!remain;
				remain -= offset;

				// This local `task` shadows the function parameter; `run`
				// already captured the parameter by reference above.
				// `run` itself is captured by reference, so it must outlive the
				// submitted job - presumably guaranteed by wait() below.
				auto task = [&run, start, step, offset, thNum]{ run(start, step, offset, thNum); };
				futures[i] = state.executor.run(thNum, std::move(task));

				start += step+offset;
			}

			state.executor.wait(futures);
		} else {
			Info info{impl.executorInfo};
			for(std::size_t i = 0; i < n; ++i)
				Task::execute(impl, task, i, info, parameters, std::tuple<>{});
		}
	}

	// Executes the `n` sub tasks of `task` like executeParallel and reduces
	// their results with Select::execute.
	//
	// Each chunk reduces its own tasks, in index order, into one slot of
	// `bests`; once state.executor.wait(futures) returns, the slots are reduced
	// in chunk order by the caller. Returns a value-initialised Value when
	// n == 0.
	// NOTE(review): template parameter list missing (the body uses `Value`,
	// `Task`, `Select`, `Impl`, `BTask`, `BSelect` and `Parameters`).
	template
	Value executeParallelAccumulate(Impl& impl, BTask& task, BSelect& select, Parameters const& parameters, std::size_t n) {
		std::size_t const maxThreads = threadLimit(impl.executorInfo);

		Value best{};

		std::size_t const nThreads = std::min(n, maxThreads);
		if(nThreads > 0) {
			// NOTE(review): element type missing; holds what
			// state.executor.run returns.
			std::vector> futures(nThreads);
			std::size_t const step = n/nThreads;
			std::size_t const remainBase = n - step*nThreads;
			std::size_t remain = remainBase;

			std::size_t const coresA = maxThreads/nThreads; // cores for sub tasks in main cases
			std::size_t const coresB = remainBase? maxThreads/remainBase : 1; // cores for remaining tasks

			typename Impl::State& state = impl.state;

			// Reduces tasks [b, b+k) (plus task b+k when `offset` is set) and
			// stores the result in `out`.
			auto run = [&](Value& out, std::size_t b, std::size_t k, bool offset, std::size_t thOffset) {
				Value best{};

				Info infoA{coresA, thOffset}, infoB{coresB, thOffset};

				// k is `step`, which is at least 1 here because nThreads <= n.
				if(k) {
					best = Task::execute(impl, task, b+0, infoA, parameters, std::tuple<>{});

					std::size_t i;
					for(i = 1; i < k; ++i) {
						Value current = Task::execute(impl, task, b+i, infoA, parameters, std::tuple<>{});
						best = Select::execute(impl, select, b+i, infoA, parameters, std::tuple<>{}, std::move(current), std::move(best));
					}

					// After the loop i == max(k, 1), i.e. k here: the index of
					// the extra task.
					if(offset) {
						Value current = Task::execute(impl, task, b+i, infoB, parameters, std::tuple<>{});
						best = Select::execute(impl, select, b+i, infoB, parameters, std::tuple<>{}, std::move(current), std::move(best));
					}
				}

				out = std::move(best);
			};

			// One partial result per chunk.
			// NOTE(review): element type missing; slots are assigned Value objects.
			std::vector bests(nThreads);
			for(std::size_t i = 0, start = 0; i < nThreads; ++i) {
				std::size_t thNum = impl.executorInfo.offset + i*coresA;
				// 1 while extra tasks remain to be handed out, 0 afterwards.
				std::size_t offset = !!remain;
				remain -= offset;

				// This local `task` shadows the function parameter; `best`
				// inside the closure is a reference to this chunk's slot, not
				// the outer `best`.
				auto task = [&, &best=bests[i], start, step, offset, thNum]{ run(best, start, step, offset, thNum); };
				futures[i] = state.executor.run(thNum, std::move(task));

				start += step+offset;
			}

			state.executor.wait(futures);

			// Final reduction of the partial results, in chunk order
			// (nThreads is always non-zero in this branch).
			if(nThreads) best = std::move(bests[0]);
			for(std::size_t i = 1; i < nThreads; ++i)
				best = Select::execute(impl, select, i, impl.executorInfo, parameters, std::tuple<>{}, std::move(bests[i]), std::move(best));
		} else {
			// No thread available: reduce sequentially in the calling thread.
			Info info{impl.executorInfo};
			if(n) best = Task::execute(impl, task, 0, info, parameters, std::tuple<>{});
			for(std::size_t i = 1; i < n; ++i) {
				Value current = Task::execute(impl, task, i, info, parameters, std::tuple<>{});
				best = Select::execute(impl, select, i, info, parameters, std::tuple<>{}, std::move(current), std::move(best));
			}
		}

		return best;
	}
};

// State attached to the StaticPool executor: the util::StaticPool it derives
// from, plus the values filled in by StaticPool::config.
// NOTE(review): the template parameter list and the specialization argument
// are missing (a dangling `>` remains after ExecutorState).
template
struct ExecutorState>: util::StaticPool {
	// Value of impl.parallelTasksCount() recorded by config().
	std::size_t parTasksCount;
	// Ordered task-id boundaries computed by buildSplit(); contextId() returns
	// the index of the boundary a task id falls under.
	// NOTE(review): element type missing; std::size_t ids are inserted into it.
	std::set split;
};

} // namespace exec
} // namespace alsk

#endif