Files
samples/pipelineTracker.hpp

376 lines
14 KiB
C++

/*
* Copyright (c) 2026 Mykhailo Mamedov. All rights reserved.
*
* RESEARCH PREVIEW / REFERENCE ONLY:
* This source code is provided solely for the purpose of reviewing
* the author's research methods and implementation.
*
* NO LICENSE GRANTED:
* This code is NOT for distribution, modification, or use in any
* project (commercial or otherwise). Unauthorized copying or
* use of this code is strictly prohibited.
*
* For inquiries regarding use or licensing, contact: ua.modin@gmail.com
*/
#ifndef WORKLOAD_HEADER
#define WORKLOAD_HEADER
#include <array>
#include <cassert>
#include <cstdint>
#include <format>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include <vulkan/vulkan.h>
// Forward declaration only: Submission2 is defined elsewhere and is not
// referenced by any code visible in this header.
namespace M_CORE {
struct Submission2;
}
namespace M_CORE {
/**
 * @brief Compile-time tuning constants for the pipeline tracker.
 */
namespace cfgPipe {
    /// Number of start/end timestamp pairs reserved per device; the tracker
    /// creates each query pool with twice this many entries.
    inline constexpr unsigned int MAX_QUERYS_PER_DEVICE{32U};

    /// Scale factor 1e-6. In this file it multiplies (ticks * timestampPeriod)
    /// when a stage duration is converted for reporting.
    inline constexpr float ONE_MICROSECOND{1e-6f};
}
/**
 * @brief Describes where a single pipeline stage runs and how it is labelled.
 */
struct StageConfig {
    /// Device on which the stage's timeline semaphore and query pool are created.
    mDevice* deviceSrc{nullptr};
    /// Optional peer device; when it is set and has a virtualDevice, the stage
    /// semaphore is requested as a relay towards it.
    mDevice* deviceDst{nullptr};
    /// Label embedded in the generated semaphore name.
    const char* stageName{nullptr};
};
/**
 * @brief One producer-to-consumer semaphore pair that is mirrored on the host.
 */
struct CrossDeviceLink {
    /// Device owning the semaphore whose counter is polled.
    mDevice* srcDevice{nullptr};
    /// Semaphore whose counter is polled.
    VkSemaphore srcSemaphore{};
    /// Device owning the semaphore that gets signalled from the host.
    mDevice* dstDevice{nullptr};
    /// Semaphore that gets signalled from the host.
    VkSemaphore dstSemaphore{};
    /// Last value signalled on dstSemaphore: the tracking floor for THIS link.
    u64 lastMirroredValue{0};
};
/**
 * @brief Mutable per-stage state kept by the tracker.
 */
struct StageContext {
    /// Copy of the configuration the stage was created with.
    StageConfig config;
    /// Timeline semaphore created for this stage on config.deviceSrc.
    sem localTimelineSem;
    /// Not read or written by any code visible in this header.
    u64 lastProducerValue{0};
    /// Number of submissions scheduled so far; also the latest timeline value handed out.
    u64 m_gpuExecutionCounter{0};
    /// Buffer index recorded by the most recent AdvanceStageSync() call.
    u32 readIdx{0};
    /// Previous sample passed to UpdateTime() (0 means "no sample yet").
    float lastTime{0.0f};
    /// Difference between the two latest UpdateTime() samples.
    float deltaTime{0.0f};
    /// Timestamp query pool shared by all stages on the same source device.
    VkQueryPool pool{};
};
/**
 * @brief Per-device bookkeeping used while the tracker is being constructed.
 */
struct StageMetrics {
    /// How many stages name this device as their source.
    u32 stageCount{0};
    /// Timestamp query pool created for the device.
    VkQueryPool pool{};
};
/**
* @brief Coordinates multi-stage workload pipelining and data flow across
* independent Vulkan devices using single-threaded host synchronization.
*
* @details This class decouples multi-GPU synchronization from driver-dependent
* external semaphore extensions. Instead of relying on native hardware casting—
* which frequently fails or deadlocks across different GPU architectures—this
* tracker isolates execution onto device-local timeline semaphores.
*
* Cross-device data boundaries are linked safely via a synchronous,
* once-per-frame MirrorSignals() call executed on the main CPU loop. This function
* peeks at the raw execution states of producer semaphores and monotonically
* advances consumer landing pads by exactly one step per frame.
*/
class VulkanPipelineTracker {
public:
    /**
     * @brief Creates one timeline semaphore per stage and one timestamp query
     *        pool per distinct source device.
     *
     * @param m_core      Used to create the per-stage semaphores; not stored.
     * @param stages      Stage descriptions; the position in this vector is the stage id.
     *                    Every entry must have a non-null deviceSrc.
     * @param bufferCount Divisor that maps timeline values onto buffer indices; must be > 0.
     */
    VulkanPipelineTracker(mCore* m_core, const std::vector<StageConfig>& stages, u32 bufferCount=3)
        : m_stageCount(static_cast<u32>(stages.size())), m_bufferCount(bufferCount)
    {
        assert(m_core && "A core object is required to create the stage semaphores.");
        assert(m_bufferCount > 0 && "bufferCount is used as a modulo divisor and must not be zero.");
        // Timestamp queries are addressed as stageId * 2 (+1) inside a pool of
        // MAX_QUERYS_PER_DEVICE * 2 entries, so every stage id has to fit.
        assert(m_stageCount <= cfgPipe::MAX_QUERYS_PER_DEVICE && "Too many stages for the query pool size.");

        m_stageContexts.resize(m_stageCount);
        std::unordered_map<VkDevice, StageMetrics> countsMap;
        for (u32 s = 0; s < m_stageCount; ++s) {
            assert(stages[s].deviceSrc && "Every stage needs a source device.");
            // stageName defaults to nullptr; formatting a null C string is undefined.
            const char* stageLabel = stages[s].stageName ? stages[s].stageName : "unnamed";
            std::string semName = std::format("pipeLocal_S{}_{}", s, stageLabel);

            StageContext& stageContext = m_stageContexts[s];
            stageContext.config = stages[s];
            semaphoreInfo info = {
                stageContext.config.deviceSrc,
                VkSemaphoreType::VK_SEMAPHORE_TYPE_TIMELINE,
                0, // Initial value.
                semName.c_str()
            };
            // A destination device with a live logical device turns the semaphore into a relay.
            if (stageContext.config.deviceDst && stageContext.config.deviceDst->virtualDevice) {
                info.flags = SEM_FLAGS::RELAY;
                info.remoteDevice = stageContext.config.deviceDst;
            }
            m_core->createSemaphore(info, stageContext.localTimelineSem);
            // A remote handle means the stage crosses devices: register it for host mirroring.
            if (stageContext.localTimelineSem.semaphoreRemote) {
                AddCrossDeviceLink(
                    stageContext.config.deviceSrc,
                    stageContext.localTimelineSem.semaphore,
                    stageContext.config.deviceDst,
                    stageContext.localTimelineSem.semaphoreRemote, 0);
            }
            countsMap[stageContext.config.deviceSrc->virtualDevice].stageCount++;
        }

        // One timestamp query pool per device; it backs the metrics of every stage on it.
        for (auto& [device, metrics] : countsMap) {
            CreateQueryStagePool(device, cfgPipe::MAX_QUERYS_PER_DEVICE, metrics.pool);
            assert(metrics.pool);
        }

        // Link each stage to the pool of its source device.
        for (StageContext& stageContext : m_stageContexts) {
            stageContext.pool = countsMap.at(stageContext.config.deviceSrc->virtualDevice).pool;
        }
    }

    /**
     * @brief Destroys the query pools created by the constructor.
     *
     * @details Pools are shared between stages of the same device, so each pool
     * is destroyed once and then cleared in every stage that referenced it.
     * The source devices must still be alive when the tracker is destroyed.
     *
     * The stage semaphores live inside m_stageContexts and must NOT be passed to
     * `delete`: they were never allocated with `new`.
     * TODO: release the underlying Vulkan semaphores through whatever API pairs
     * with mCore::createSemaphore (not visible from this header).
     */
    ~VulkanPipelineTracker() {
        for (StageContext& owner : m_stageContexts) {
            if (!owner.pool || !owner.config.deviceSrc) {
                continue;
            }
            const VkDevice device = owner.config.deviceSrc->virtualDevice;
            const VkQueryPool pool = owner.pool;
            vkDestroyQueryPool(device, pool, nullptr);
            for (StageContext& sharer : m_stageContexts) {
                if (sharer.pool == pool && sharer.config.deviceSrc
                    && sharer.config.deviceSrc->virtualDevice == device) {
                    sharer.pool = VkQueryPool{};
                }
            }
        }
    }

    // The tracker owns Vulkan handles: a copy would destroy them twice.
    VulkanPipelineTracker(const VulkanPipelineTracker&) = delete;
    VulkanPipelineTracker& operator=(const VulkanPipelineTracker&) = delete;
    // Moving transfers the containers and leaves the source empty, so its destructor is a no-op.
    VulkanPipelineTracker(VulkanPipelineTracker&&) noexcept = default;
    VulkanPipelineTracker& operator=(VulkanPipelineTracker&&) = delete;

    /**
     * @brief Registers a cross-device pipeline handoff link.
     *
     * @param srcDev            Device owning @p srcSem.
     * @param srcSem            Semaphore whose counter is polled.
     * @param dstDev            Device owning @p dstSem.
     * @param dstSem            Semaphore signalled from the host.
     * @param initialFloorValue Value assumed to be already mirrored.
     */
    void AddCrossDeviceLink(mDevice* srcDev, VkSemaphore srcSem, mDevice* dstDev, VkSemaphore dstSem, u64 initialFloorValue = 0) {
        CrossDeviceLink link{};
        link.srcDevice = srcDev;
        link.srcSemaphore = srcSem;
        link.dstDevice = dstDev;
        link.dstSemaphore = dstSem;
        link.lastMirroredValue = initialFloorValue;
        m_links.push_back(link);
    }

    /**
     * @brief Processes all registered cross-device signals.
     * @details Call this exactly ONCE per CPU frame loop tick (for now).
     * Each link whose source counter is ahead of its mirrored floor gets its
     * destination semaphore advanced by exactly one step.
     */
    void MirrorSignals() {
        for (CrossDeviceLink& link : m_links) {
            // Peek at the live source counter (non-blocking query).
            u64 currentSrcValue = 0;
            VkResult srcRes = vkGetSemaphoreCounterValue(link.srcDevice->virtualDevice, link.srcSemaphore, &currentSrcValue);
            CHECK_VK_RESULT(srcRes, "vkGetSemaphoreCounterValue\n");

            // If the source has moved ahead, advance the target by a single step only.
            if (currentSrcValue > link.lastMirroredValue) {
                link.lastMirroredValue++; // (e.g., 1 -> 2)
                VkSemaphoreSignalInfo signalInfo{ VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO };
                signalInfo.semaphore = link.dstSemaphore;
                signalInfo.value = link.lastMirroredValue;
                // Host-side signal; at most one per link per call keeps driver traffic low.
                VkResult res = vkSignalSemaphore(link.dstDevice->virtualDevice, &signalInfo);
                CHECK_VK_RESULT(res, "vkSignalSemaphore\n");
            }
        }
    }

    /**
     * @brief Tells whether the workload scheduled for a stage has completed.
     *
     * @param stageId       Stage to query.
     * @param timeLineValue Receives the current semaphore counter. Left untouched
     *                      when nothing was ever scheduled for the stage.
     * @return true when nothing was scheduled, or when the semaphore counter
     *         equals the last scheduled value; false otherwise.
     */
    bool IsStageComplete(u32 stageId, u64& timeLineValue) {
        StageContext& stage = stageAt(stageId);
        u64 currentSelfValue = stage.m_gpuExecutionCounter;
        if (currentSelfValue > 0) {
            u64 currentGPUValue = 0;
            VkResult res = vkGetSemaphoreCounterValue(
                stage.config.deviceSrc->virtualDevice,
                stage.localTimelineSem.semaphore, &currentGPUValue);
            CHECK_VK_RESULT(res, "vkGetSemaphoreCounterValue\n");
            timeLineValue = currentGPUValue;
            if (currentSelfValue != currentGPUValue)
                return false;
        }
        return true;
    }

    /**
     * @brief Temporary host-side time counter, to be replaced by the query pool.
     * DEPRECATED
     *
     * @details The first call only stores @p currentTime; later calls store the
     * difference to the previous sample in deltaTime. A stored sample of exactly
     * 0 is treated as "no sample yet".
     */
    void UpdateTime(u32 stageId, float currentTime) {
        StageContext* context = GetStageContext(stageId);
        if (!context->lastTime) {
            context->lastTime = currentTime;
        }
        else {
            context->deltaTime = currentTime - context->lastTime;
            context->lastTime = currentTime;
        }
    }

    /**
     * @brief Creates a timestamp query pool on a specific device.
     *
     * @param device     Device that will own the pool.
     * @param stageCount Number of start/end pairs; the pool gets twice as many queries.
     * @param resultPool Receives the created pool handle.
     */
    void CreateQueryStagePool(VkDevice device, u32 stageCount, VkQueryPool& resultPool) {
        VkQueryPoolCreateInfo poolInfo = {};
        poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
        poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
        poolInfo.queryCount = stageCount * 2; // One for start, one for end
        VkResult res = vkCreateQueryPool(device, &poolInfo, nullptr, &resultPool);
        CHECK_VK_RESULT(res, "vkCreateQueryPool\n");
    }

    /**
     * @brief Wraps user command recording between a start and an end timestamp.
     *
     * @param cmdBuffer      Command buffer currently being recorded.
     * @param context        Stage whose query pool receives the timestamps.
     * @param stageId        Selects the query pair (stageId * 2, stageId * 2 + 1).
     * @param recordCommands Callback that records the commands to be measured.
     */
    void recordTimedCommands(
        VkCommandBuffer cmdBuffer,
        StageContext* context,
        u32 stageId,
        std::function<void()> recordCommands)
    {
        assert(context && "A stage context is required.");
        u32 queryStart = stageId * 2;
        u32 queryEnd = queryStart + 1;
        assert(queryEnd < cfgPipe::MAX_QUERYS_PER_DEVICE * 2 && "Query index outside the pool.");

        // Reset start and end queries
        vkCmdResetQueryPool(cmdBuffer, context->pool, queryStart, 2);
        // Write Start Timestamp
        // TODO: decide whether BOTTOM_OF_PIPE would be the better stage here.
        vkCmdWriteTimestamp(cmdBuffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, context->pool, queryStart);
        // User-provided command recording
        recordCommands();
        // Write End Timestamp
        vkCmdWriteTimestamp(cmdBuffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, context->pool, queryEnd);
    }

    /**
     * @brief Computes a stage duration from its recorded start and end timestamps.
     * TODO: Move away.
     *
     * @details The tick delta is scaled by the device's timestampPeriod and by
     * cfgPipe::ONE_MICROSECOND. The result buffer is zero-initialised so that a
     * query that has not produced results yet gives 0 instead of garbage.
     *
     * @param context Stage whose device and pool are queried.
     * @param stageId Selects the query pair (stageId * 2, stageId * 2 + 1).
     * @return Duration in milliseconds.
     */
    float getStageDurationMs(StageContext* context, u32 stageId) {
        assert(context && "A stage context is required.");
        u64 results[2] = { 0, 0 };
        // Retrieve timestamp ticks (64-bit)
        VkResult res = vkGetQueryPoolResults(
            context->config.deviceSrc->virtualDevice, context->pool,
            stageId * 2, 2,
            sizeof(results), results,
            sizeof(u64), VK_QUERY_RESULT_64_BIT
        );
        CHECK_VK_RESULT(res, "vkGetQueryPoolResults\n");
        // Convert to milliseconds.
        u64 ticksDelta = results[1] - results[0];
        return static_cast<float>(ticksDelta)
            * context->config.deviceSrc->phyDeviceData.m_devProps.limits.timestampPeriod
            * cfgPipe::ONE_MICROSECOND;
    }

    /**
     * @brief Tells whether a buffer index may be written with respect to a stage.
     *
     * @param stageId     Stage to check against.
     * @param indexTarget Buffer index the caller wants to write.
     * @return false only when @p indexTarget is the stage's recorded read index
     *         and the stage's last scheduled value has not been reached yet
     *         (compared modulo the buffer count); true otherwise, including
     *         when nothing was ever scheduled.
     */
    bool IsIndexSafeToWrite(u32 stageId, u32 indexTarget) {
        StageContext& stage = stageAt(stageId);
        u64 stageScheduledValue = stage.m_gpuExecutionCounter;
        if (stageScheduledValue > 0) {
            u64 currentGPUValue = 0;
            VkResult res = vkGetSemaphoreCounterValue(
                stage.config.deviceSrc->virtualDevice,
                stage.localTimelineSem.semaphore,
                &currentGPUValue);
            CHECK_VK_RESULT(res, "vkGetSemaphoreCounterValue\n");
            u32 scheduledWriteIdx = stageScheduledValue % m_bufferCount;
            u32 activeReadIdx = stage.readIdx;
            u32 latestFinishedStageIndex = currentGPUValue % m_bufferCount;
            // NOTE(review): the comparison is done modulo m_bufferCount, so a stage that
            // is exactly m_bufferCount submissions behind (or any backlog when
            // m_bufferCount == 1) looks finished. Confirm callers never queue that deep.
            if (indexTarget == activeReadIdx && scheduledWriteIdx != latestFinishedStageIndex) {
                // The target is being read by the stage and its scheduled work is not finished yet.
                return false;
            }
        }
        return true;
    }

    /**
     * @brief Maps the stage's current semaphore counter onto a buffer index.
     *
     * @param stageId       Stage to query.
     * @param timeLineValue Receives the counter when it is non-zero; left
     *                      untouched while the stage has not finished anything.
     * @return counter % bufferCount, or 0 while the counter is still 0.
     */
    u32 getSafeIndex(u32 stageId, u64& timeLineValue) {
        StageContext& stage = stageAt(stageId);
        u64 currentGPUValue = 0;
        VkResult res = vkGetSemaphoreCounterValue(
            stage.config.deviceSrc->virtualDevice,
            stage.localTimelineSem.semaphore,
            &currentGPUValue);
        CHECK_VK_RESULT(res, "vkGetSemaphoreCounterValue\n");
        if (currentGPUValue > 0) {
            u32 index = currentGPUValue % m_bufferCount;
            timeLineValue = currentGPUValue;
            return index;
        }
        // Nothing finished yet: return a defined index instead of falling off the end.
        return 0;
    }

    /**
     * @brief Hands out the next timeline value and the semaphore to signal it on.
     *
     * @details The stage is expected to call this once IsStageComplete() has
     * confirmed that its previous workload finished.
     *
     * @param stageId       Stage that is about to submit.
     * @param targetValue   Receives the new timeline value to signal on submission.
     * @param semObject     Receives a pointer to the stage's semaphore object.
     * @param readIdxTarget Buffer index the submission will read; stored for IsIndexSafeToWrite().
     */
    void AdvanceStageSync(u32 stageId, u64& targetValue, sem*& semObject, u32 readIdxTarget) {
        StageContext& stage = stageAt(stageId);
        stage.m_gpuExecutionCounter++;
        stage.readIdx = readIdxTarget;
        targetValue = stage.m_gpuExecutionCounter;
        semObject = &stage.localTimelineSem;
    }

    /// @brief Returns the mutable context of a stage.
    StageContext* GetStageContext(u32 stageId) {
        return &stageAt(stageId);
    }

    /// @brief Returns the logical device of a stage's source device.
    VkDevice GetStageDevice(u32 stageId) {
        assert(stageId < m_stageContexts.size() && "Wrong ID requested.");
        return m_stageContexts[stageId].config.deviceSrc->virtualDevice;
    }

private:
    /// Bounds-checked (debug builds) access to a stage context.
    StageContext& stageAt(u32 stageId) {
        assert(stageId < m_stageContexts.size() && "Wrong ID requested.");
        return m_stageContexts[stageId];
    }

    u32 m_stageCount;
    u32 m_bufferCount;
    std::vector<CrossDeviceLink> m_links;
    std::vector<StageContext> m_stageContexts;
};
} // namespace
#endif // WORKLOAD_HEADER
// Implementation section: compiled only when WORKLOAD_COMPONENT is defined.
// It is currently empty.
#ifdef WORKLOAD_COMPONENT
namespace M_CORE {
} // namespace
#endif