cmake_minimum_required(VERSION 3.20...3.28)

project(hex-processor)

# NOTE: CMakeLists.txt lines 5-29 lie outside the visible diff hunks and are
# not reproduced here.

option(USE_VERILATOR "Use Verilator for simulation" ON)
option(BUILD_DOCS "Create and install HTML documentation" OFF)

# Verilator. Prefer a sufficiently recent system install; otherwise fetch and
# build a pinned version from source (needs host tools: autoconf, make, flex,
# bison).
if(USE_VERILATOR)
  set(VERILATOR_GIT_TAG "v5.048"
      CACHE STRING "Verilator version to fetch/build when no system install is suitable")
  set(VERILATOR_MIN_VERSION "5.0"
      CACHE STRING "Minimum acceptable system Verilator version")

  # First try a system install (honouring VERILATOR_ROOT). The version floor
  # rejects too-old installs (e.g. 4.x lacks the VerilatedContext API hextb
  # uses), so the source build below takes over.
  find_package(verilator ${VERILATOR_MIN_VERSION} QUIET
               HINTS $ENV{VERILATOR_ROOT} ${VERILATOR_ROOT})

  if(verilator_FOUND)
    message(STATUS "Using system Verilator ${verilator_VERSION} (>= ${VERILATOR_MIN_VERSION})")
  else()
    set(VERILATOR_INSTALL_DIR "${CMAKE_BINARY_DIR}/verilator")
    set(VERILATOR_ROOT "${VERILATOR_INSTALL_DIR}/share/verilator")

    # Build once; subsequent configures reuse the installed tree.
    if(NOT EXISTS "${VERILATOR_ROOT}/verilator-config.cmake")
      message(STATUS "No system Verilator >= ${VERILATOR_MIN_VERSION}; "
                     "fetching and building ${VERILATOR_GIT_TAG} "
                     "(one-time, needs autoconf/flex/bison)...")

      # Locate the host tools before cloning, so a missing tool is reported
      # immediately and by name instead of as a failed command after a
      # lengthy fetch.
      find_program(HEX_AUTOCONF_EXECUTABLE NAMES autoconf)
      find_program(HEX_MAKE_EXECUTABLE NAMES make gmake)
      mark_as_advanced(HEX_AUTOCONF_EXECUTABLE HEX_MAKE_EXECUTABLE)
      if(NOT HEX_AUTOCONF_EXECUTABLE OR NOT HEX_MAKE_EXECUTABLE)
        message(FATAL_ERROR
          "Building Verilator from source needs the host tools autoconf, make, "
          "flex and bison (e.g. 'apt-get install autoconf make flex bison'). "
          "Install them, then reconfigure.")
      endif()

      # include() is idempotent, so this is safe even if the module was
      # already loaded earlier in the file.
      include(FetchContent)
      FetchContent_Declare(
        verilator
        GIT_REPOSITORY https://github.com/verilator/verilator.git
        GIT_TAG ${VERILATOR_GIT_TAG}
        GIT_SHALLOW TRUE)
      FetchContent_GetProperties(verilator)
      if(NOT verilator_POPULATED)
        FetchContent_Populate(verilator)
      endif()

      include(ProcessorCount)
      ProcessorCount(NPROC)
      if(NPROC EQUAL 0)
        set(NPROC 1)
      endif()

      # Verilator uses an autoconf build; a git checkout has no generated
      # configure, so run autoconf first.
      execute_process(COMMAND "${HEX_AUTOCONF_EXECUTABLE}"
                      WORKING_DIRECTORY "${verilator_SOURCE_DIR}"
                      RESULT_VARIABLE VL_AUTOCONF)
      if(VL_AUTOCONF)
        message(FATAL_ERROR
          "Could not run 'autoconf' to build Verilator. Install the host build "
          "tools first: autoconf, flex, bison (e.g. "
          "'apt-get install autoconf flex bison'), then reconfigure.")
      endif()
      execute_process(COMMAND "${verilator_SOURCE_DIR}/configure"
                              "--prefix=${VERILATOR_INSTALL_DIR}"
                      WORKING_DIRECTORY "${verilator_SOURCE_DIR}"
                      RESULT_VARIABLE VL_CONFIGURE)
      if(VL_CONFIGURE)
        message(FATAL_ERROR "Verilator ./configure failed (need flex and bison).")
      endif()
      # Build only the binaries (verilator_exe) and install without man pages,
      # so help2man is not required.
      execute_process(COMMAND "${HEX_MAKE_EXECUTABLE}" -j${NPROC} verilator_exe
                      WORKING_DIRECTORY "${verilator_SOURCE_DIR}"
                      RESULT_VARIABLE VL_MAKE)
      if(VL_MAKE)
        message(FATAL_ERROR "Verilator build (make) failed.")
      endif()
      execute_process(COMMAND "${HEX_MAKE_EXECUTABLE}"
                              installbin installredirect installdata install-msg
                      WORKING_DIRECTORY "${verilator_SOURCE_DIR}"
                      RESULT_VARIABLE VL_INSTALL)
      if(VL_INSTALL)
        message(FATAL_ERROR "Verilator install failed.")
      endif()
    endif()

    # Clear any cached binary (e.g. an earlier system-Verilator probe) so the
    # config resolves verilator_bin against the fetched install.
    unset(VERILATOR_BIN CACHE)
    find_package(verilator REQUIRED HINTS ${VERILATOR_ROOT})
    message(STATUS "Using fetched Verilator ${VERILATOR_GIT_TAG} from ${VERILATOR_ROOT}")
  endif()
endif()

# Compiler config

# NOTE: the lines between the two diff hunks (compiler configuration, the host
# tool targets and their install rule) are not visible and are not reproduced
# here.

# Verilator
if(USE_VERILATOR)
  # Full-network testbench: Verilates network_top and drives it from hextb.cpp.
  add_executable(hextb hex.cpp hextb.cpp)
  # An executable has no consumers, so its dependencies are PRIVATE.
  target_link_libraries(hextb PRIVATE fmt::fmt)
  verilate(
    hextb
    PREFIX Vntb
    TOP_MODULE network_top
    VERILATOR_ARGS -Wall -Wno-UNUSEDPARAM -Wno-UNUSEDSIGNAL
    SOURCES
    verilog/hex_pkg.sv
    verilog/memory.sv
    verilog/processor.sv
    verilog/link_interface.sv
    verilog/core.sv
    verilog/router.sv
    verilog/network_top.sv)
  install(TARGETS hextb DESTINATION ${CMAKE_INSTALL_BINDIR})

  # Per-module RTL testbenches (router, link interface, single core).
  add_executable(router_tb tests/rtl/router_tb.cpp)
  verilate(router_tb PREFIX Vrouter SOURCES verilog/hex_pkg.sv verilog/router.sv
           TOP_MODULE router VERILATOR_ARGS -Wall -Wno-UNUSEDPARAM)

  add_executable(liu_tb tests/rtl/liu_tb.cpp)
  verilate(liu_tb SOURCES verilog/hex_pkg.sv verilog/link_interface.sv
           PREFIX Vlink_interface TOP_MODULE link_interface
           VERILATOR_ARGS -Wall -Wno-UNUSEDPARAM -Wno-UNUSEDSIGNAL)

  add_executable(core_tb tests/rtl/core_tb.cpp)
  verilate(core_tb SOURCES verilog/hex_pkg.sv verilog/memory.sv
           verilog/processor.sv verilog/link_interface.sv verilog/core.sv
           PREFIX Vcore TOP_MODULE core
           VERILATOR_ARGS -Wall -Wno-UNUSEDPARAM -Wno-UNUSEDSIGNAL)
endif()

enable_testing()
#ifndef HEX_CONTAINER_HPP
#define HEX_CONTAINER_HPP

#include <cstdint>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

//===---------------------------------------------------------------------===//
// Reader for the HEXN network container format, shared by the C++ simulator
// (hexsim) and the Verilator testbench (hextb) so they cannot drift.
//
// Layout (little-endian):
//   uint32 magic = 0x4E584548 ("HEXN")
//   uint32 numProcessors
//   uint32 numEdges
//   edges[numEdges]: uint32 procA, slotA, procB, slotB
//   images[numProcessors]: uint32 imageSizeBytes, then imageSizeBytes of a
//     standard single-image binary (size-word + code + debug info).
//
// A file without the magic is treated as a single plain image.
//===---------------------------------------------------------------------===//

namespace hexcontainer {

const uint32_t MAGIC = 0x4E584548; // "HEXN"

/// One channel: connects slot `slotA` of processor `procA` to slot `slotB` of
/// processor `procB`.
struct Edge {
  uint32_t procA, slotA, procB, slotB;
};

/// Parsed contents of a container file.
struct Container {
  bool isNetwork = false;                // false => single plain image
  std::vector<Edge> edges;               // channel wiring (network only)
  std::vector<std::vector<char>> images; // per-processor image bytes
};

/// Read one little-endian uint32 from `file`.
///
/// Does not throw: on a short read the stream's failbit/eofbit are set and the
/// bytes that could not be read are taken as zero, so callers that need to
/// detect truncation must test the stream afterwards.
inline uint32_t readU32(std::istream &file) {
  unsigned char bytes[4] = {0, 0, 0, 0};
  file.read(reinterpret_cast<char *>(bytes), sizeof(bytes));
  // Assemble explicitly so the result matches the documented little-endian
  // layout regardless of host byte order.
  return static_cast<uint32_t>(bytes[0]) |
         (static_cast<uint32_t>(bytes[1]) << 8) |
         (static_cast<uint32_t>(bytes[2]) << 16) |
         (static_cast<uint32_t>(bytes[3]) << 24);
}

/// Read a container file. If it lacks the HEXN magic, returns a single-image
/// container (isNetwork = false, one image holding the whole file).
///
/// Every count and size taken from the file is checked against the number of
/// bytes actually remaining before it is used to allocate or read, so a
/// truncated or corrupt file is rejected instead of producing zero-filled
/// images or an enormous allocation.
///
/// \throws std::runtime_error if the file cannot be opened or read, is
///         truncated, declares more data than it contains, or has an edge
///         whose processor index is not below numProcessors.
inline Container read(const std::string &filename) {
  std::ifstream file(filename, std::ios::binary);
  if (!file) {
    throw std::runtime_error("could not open file: " + filename);
  }
  file.seekg(0, std::ios::end);
  const std::streamoff endPos = file.tellg();
  if (endPos < 0) {
    // tellg() reports failure as -1; never turn that into a huge size.
    throw std::runtime_error("could not determine size of file: " + filename);
  }
  file.seekg(0, std::ios::beg);
  // Bytes of the file not yet consumed by the parser.
  uint64_t remaining = static_cast<uint64_t>(endPos);

  Container container;
  const uint32_t magic = readU32(file);
  if (!file || magic != MAGIC) {
    // Single plain image: the whole file is one image. A file shorter than
    // four bytes leaves failbit set after the magic read, which would turn the
    // seek and read below into no-ops, so reset the stream state first.
    file.clear();
    file.seekg(0, std::ios::beg);
    std::vector<char> image(static_cast<size_t>(remaining));
    if (!image.empty()) {
      file.read(image.data(), static_cast<std::streamsize>(image.size()));
      if (!file) {
        throw std::runtime_error("could not read file: " + filename);
      }
    }
    container.images.push_back(std::move(image));
    return container;
  }

  container.isNetwork = true;
  remaining -= sizeof(uint32_t); // the magic word

  // Read the next header/table word, failing loudly if the file ends early.
  auto nextU32 = [&](const char *what) -> uint32_t {
    if (remaining < sizeof(uint32_t)) {
      throw std::runtime_error(std::string("truncated container (missing ") +
                               what + "): " + filename);
    }
    const uint32_t value = readU32(file);
    if (!file) {
      throw std::runtime_error(std::string("could not read ") + what + ": " +
                               filename);
    }
    remaining -= sizeof(uint32_t);
    return value;
  };

  const uint32_t numProcessors = nextU32("processor count");
  const uint32_t numEdges = nextU32("edge count");

  // Each edge occupies four words; reject counts the file cannot hold before
  // allocating the table.
  if (static_cast<uint64_t>(numEdges) * 4 * sizeof(uint32_t) > remaining) {
    throw std::runtime_error("edge table exceeds file size: " + filename);
  }
  container.edges.resize(numEdges);
  for (auto &e : container.edges) {
    e.procA = nextU32("edge");
    e.slotA = nextU32("edge");
    e.procB = nextU32("edge");
    e.slotB = nextU32("edge");
    // Consumers index their processor tables with these values.
    if (e.procA >= numProcessors || e.procB >= numProcessors) {
      throw std::runtime_error("edge references a processor out of range: " +
                               filename);
    }
  }

  // Every image carries at least its size word.
  if (static_cast<uint64_t>(numProcessors) * sizeof(uint32_t) > remaining) {
    throw std::runtime_error("image table exceeds file size: " + filename);
  }
  container.images.reserve(numProcessors);
  for (uint32_t i = 0; i < numProcessors; i++) {
    const uint32_t imageSize = nextU32("image size");
    if (imageSize > remaining) {
      throw std::runtime_error("image exceeds file size: " + filename);
    }
    std::vector<char> image(imageSize);
    if (imageSize != 0) {
      file.read(image.data(), static_cast<std::streamsize>(imageSize));
      if (!file) {
        throw std::runtime_error("could not read image: " + filename);
      }
    }
    remaining -= imageSize;
    container.images.push_back(std::move(image));
  }
  return container;
}

} // namespace hexcontainer

#endif // HEX_CONTAINER_HPP
- file.seekg(0, std::ios::beg); - addProcessor(file, static_cast(fileSize), 0); - return; - } - - uint32_t numProcessors; - uint32_t numEdges; - file.read(reinterpret_cast(&numProcessors), sizeof(uint32_t)); - file.read(reinterpret_cast(&numEdges), sizeof(uint32_t)); - - struct Edge { - uint32_t procA, slotA, procB, slotB; - }; - std::vector edges(numEdges); - for (auto &e : edges) { - file.read(reinterpret_cast(&e.procA), sizeof(uint32_t)); - file.read(reinterpret_cast(&e.slotA), sizeof(uint32_t)); - file.read(reinterpret_cast(&e.procB), sizeof(uint32_t)); - file.read(reinterpret_cast(&e.slotB), sizeof(uint32_t)); - } - - // Read each embedded image into its own processor. - for (uint32_t i = 0; i < numProcessors; i++) { - uint32_t imageSize; - file.read(reinterpret_cast(&imageSize), sizeof(uint32_t)); - std::vector buffer(imageSize); - file.read(buffer.data(), imageSize); - std::istringstream imageStream(std::string(buffer.begin(), buffer.end()), + auto container = hexcontainer::read(filename); + // One processor per image (a plain single image yields one processor). + for (size_t i = 0; i < container.images.size(); i++) { + auto &image = container.images[i]; + std::istringstream imageStream(std::string(image.begin(), image.end()), std::ios::binary); - addProcessor(imageStream, imageSize, i); + addProcessor(imageStream, static_cast(image.size()), + static_cast(i)); } - // Wire up the channels. 
- for (auto &e : edges) { + for (auto &e : container.edges) { auto channel = std::make_unique(); procs[e.procA]->setLink(e.slotA, channel.get()); procs[e.procB]->setLink(e.slotB, channel.get()); diff --git a/hextb.cpp b/hextb.cpp index a383602..2685967 100644 --- a/hextb.cpp +++ b/hextb.cpp @@ -3,134 +3,198 @@ #include #include #include -#include #include +#include #include -#include "Vhex_pkg.h" -#include "Vhex_pkg_hex.h" -#include "Vhex_pkg_memory.h" -#include "Vhex_pkg_processor.h" +#include "Vntb.h" +#include "Vntb_core.h" +#include "Vntb_memory.h" +#include "Vntb_network_top.h" +#include "Vntb_processor.h" #include "hex.hpp" +#include "hexcontainer.hpp" #include "hexsimio.hpp" double sc_time_stamp() { return 0; } -constexpr size_t RESET_BEGIN = 1; -constexpr size_t RESET_END = 10; - hex::HexSimIO io(std::cin, std::cout); -void load(const char *filename, const std::unique_ptr &top) { - - // Load the binary file. - std::streampos fileSize; - std::ifstream file(filename, std::ios::binary); - - // Get length of file. - file.seekg(0, std::ios::end); - fileSize = file.tellg(); - file.seekg(0, std::ios::beg); - - // Check the file length matches. - unsigned remainingFileSize = static_cast(fileSize) - 4; - remainingFileSize = - (remainingFileSize + 3U) & ~3U; // Round up to multiple of 4. - unsigned programSize; - file.read(reinterpret_cast(&programSize), 4); - programSize <<= 2; - if (programSize != remainingFileSize) { - std::cerr << fmt::format("Warning: mismatching program size {} != {}\n", - programSize, remainingFileSize); +// Must match hex_pkg::NUM_CORES in the SystemVerilog. +constexpr unsigned NUM_CORES = 4; +static_assert(NUM_CORES == 4, + "coreOf() enumerates exactly 4 cores by their generate-loop " + "names; extend it if NUM_CORES changes."); + +// A two-instruction loop (NFIX F; BR E) that spins in place forever (BR E adds +// -2, returning to the NFIX) without any syscall or channel op, keeping cores +// with no image quiescent. 
+constexpr uint32_t HALT_LOOP = 0x00009EFFu; + +// Reach core k. network_top instantiates the cores in a generate loop, so +// Verilator exposes them under mangled names (g_core[k].u_core); the names are +// stable because the Verilator version is pinned. +static Vntb_core *coreOf(const std::unique_ptr &top, unsigned k) { + switch (k) { + case 0: return top->network_top->g_core__BRA__0__KET____DOT__u_core; + case 1: return top->network_top->g_core__BRA__1__KET____DOT__u_core; + case 2: return top->network_top->g_core__BRA__2__KET____DOT__u_core; + case 3: return top->network_top->g_core__BRA__3__KET____DOT__u_core; + default: throw std::runtime_error("core index out of range"); } +} - // Read the file contents. - std::vector buffer(remainingFileSize); - file.read(reinterpret_cast(buffer.data()), remainingFileSize); - - // Write program to DUT memory. - std::memcpy(top->hex->u_memory->memory_q.data(), buffer.data(), - buffer.size()); - - std::cout << "Wrote " << programSize << " bytes to memory\n"; +static IData *memOf(const std::unique_ptr &top, unsigned k) { + return coreOf(top, k)->u_memory->memory_q.data(); } -void handleSyscall(hex::Syscall syscall, const std::unique_ptr &top, - int &exitCode, bool trace) { - unsigned spWordIndex = top->hex->u_memory->memory_q[1]; +// Service one core's syscall, reading arguments from its own memory. 
+static void handleSyscall(unsigned k, hex::Syscall syscall, + const std::unique_ptr &top, int &exitValue, + bool &exited) { + IData *mem = memOf(top, k); + unsigned sp = mem[1]; switch (syscall) { case hex::Syscall::EXIT: - exitCode = top->hex->u_memory->memory_q[spWordIndex + 2]; - if (trace) { - std::cout << fmt::format("exit {}\n", exitCode); - } + exitValue = mem[sp + 2]; + exited = true; break; - case hex::Syscall::WRITE: { - char value = top->hex->u_memory->memory_q[spWordIndex + 2]; - int stream = top->hex->u_memory->memory_q[spWordIndex + 3]; - if (trace) { - std::cout << fmt::format("output({}, {})\n", value, stream); - } - io.output(value, stream); + case hex::Syscall::WRITE: + io.output(static_cast(mem[sp + 2]), static_cast(mem[sp + 3])); break; - } - case hex::Syscall::READ: { - int stream = top->hex->u_memory->memory_q[spWordIndex + 2]; - if (trace) { - std::cout << fmt::format("input({})\n", stream); - } - // Truncated inputs (ie not sign extended). - top->hex->u_memory->memory_q[spWordIndex + 1] = io.input(stream) & 0xFF; + case hex::Syscall::READ: + mem[sp + 1] = io.input(mem[sp + 2]) & 0xFF; break; - } default: throw std::runtime_error("invalid syscall"); } } -int run(const std::unique_ptr &contextp, - const std::unique_ptr &top, bool trace, size_t maxCycles) { - uint64_t cycle_count = 0; - int exitCode = 0; +// Read the container, fill every core with the halt loop, and load the images +// into the active cores. Returns the parsed container (its edges are used to +// program the route tables). +static hexcontainer::Container load(const char *filename, + const std::unique_ptr &top) { + auto container = hexcontainer::read(filename); + unsigned numActive = container.images.size(); + if (numActive > NUM_CORES) { + throw std::runtime_error("container has more processors than NUM_CORES"); + } - // Set input signals - top->i_rst = 0; + // Quiescent default for every core. 
+ for (unsigned k = 0; k < NUM_CORES; k++) { + memOf(top, k)[0] = HALT_LOOP; + } + + // Load each image's code into its core's memory. + for (unsigned i = 0; i < numActive; i++) { + auto &image = container.images[i]; + uint32_t programSizeWords; + std::memcpy(&programSizeWords, image.data(), 4); + std::memcpy(memOf(top, i), image.data() + 4, programSizeWords << 2); + } + std::cout << fmt::format("Loaded {} processor image(s)\n", numActive); + return container; +} + +// Program one route-table entry: core->slot maps to (dstCore, dstSlot). Driven +// over one clock with the config write enable asserted. +static void writeRoute(const std::unique_ptr &ctx, + const std::unique_ptr &top, unsigned core, + unsigned slot, unsigned dstCore, unsigned dstSlot) { + top->i_cfg_we = 1; + top->i_cfg_core = core; + top->i_cfg_slot = slot; + top->i_cfg_dst_core = dstCore; + top->i_cfg_dst_slot = dstSlot; + ctx->timeInc(1); top->i_clk = 0; top->eval(); + ctx->timeInc(1); top->i_clk = 1; top->eval(); + top->i_cfg_we = 0; +} + +int run(const std::unique_ptr &ctx, + const std::unique_ptr &top, const char *filename, bool trace, + size_t maxCycles) { top->i_clk = 0; + top->i_cfg_we = 0; + top->i_rst = 1; + top->eval(); + + auto container = load(filename, top); + unsigned numActive = container.images.size(); + + // Hold reset for a few cycles, then program the routing tables (config writes + // are independent of reset). + for (int i = 0; i < 4; i++) { + ctx->timeInc(1); top->i_clk = 0; top->eval(); + ctx->timeInc(1); top->i_clk = 1; top->eval(); + } + for (auto &e : container.edges) { + writeRoute(ctx, top, e.procA, e.slotA, e.procB, e.slotB); + writeRoute(ctx, top, e.procB, e.slotB, e.procA, e.slotA); + } + top->i_rst = 0; - while (!contextp->gotFinish() && - (maxCycles > 0 ? cycle_count <= maxCycles : true)) { - contextp->timeInc(1); - // Toggle the clock. - top->i_clk = !top->i_clk; - // Assert reset initially. 
- if (top->i_clk) { - if (contextp->time() > RESET_BEGIN && contextp->time() < RESET_END) { - top->i_rst = 1; // Assert reset - } else { - top->i_rst = 0; // Deassert reset + std::vector exited(numActive, false); + std::vector prevPc(numActive, 0xFFFFFFFFu); + unsigned numExited = 0; + int exitCode = 0; + bool haveExit = false; + uint64_t cycles = 0; + // A channel rendezvous legitimately freezes the participating cores' PCs for + // a few cycles while the flit/ACK traverse the routers. Only declare deadlock + // after a sustained stretch with no PC change and no syscall anywhere. + unsigned noProgress = 0; + constexpr unsigned DEADLOCK_THRESHOLD = 256; + + while (numExited < numActive && (maxCycles == 0 || cycles <= maxCycles)) { + // Falling then rising edge. + ctx->timeInc(1); top->i_clk = 0; top->eval(); + ctx->timeInc(1); top->i_clk = 1; top->eval(); + cycles++; + + bool progressed = false; + for (unsigned k = 0; k < numActive; k++) { + if (exited[k]) { + continue; + } + // Syscalls. + if ((top->o_syscall_valid >> k) & 1) { + bool justExited = false; + int exitValue = 0; + handleSyscall(k, static_cast(top->o_syscall[k]), top, + exitValue, justExited); + progressed = true; + if (justExited) { + exited[k] = true; + numExited++; + if (!haveExit) { + exitCode = exitValue; // first EXIT sets the system exit code + haveExit = true; + } + } + } + // Progress detection for deadlock. + unsigned pc = coreOf(top, k)->u_processor->pc_q; + if (pc != prevPc[k]) { + progressed = true; + } + prevPc[k] = pc; + if (trace) { + std::cout << fmt::format("[{:6}] core{} pc={}\n", cycles, k, pc); } } - // Evaluate the design. 
- top->eval(); - if (top->i_clk) { - cycle_count++; - } - // Trace - if (trace && top->i_clk && contextp->time() > RESET_END) { - auto instr = instrEnumToStr( - static_cast((top->hex->u_processor->instr >> 4) & 0xF)); - std::cout << fmt::format( - "[{:6}] {:6} {:#04x} {:6}\n", contextp->time(), - top->hex->u_processor->pc_q, - static_cast(top->hex->u_processor->instr), instr); - } - // Handle syscalls - if (top->i_clk && top->o_syscall_valid) { - auto syscall = static_cast(top->o_syscall); - handleSyscall(syscall, top, exitCode, trace); - if (syscall == hex::Syscall::EXIT) { - break; + + noProgress = progressed ? 0 : noProgress + 1; + if (noProgress > DEADLOCK_THRESHOLD && numExited < numActive) { + std::string msg = "deadlock: cores blocked:"; + for (unsigned k = 0; k < numActive; k++) { + if (!exited[k]) { + msg += fmt::format(" {}", k); + } } + top->final(); + throw std::runtime_error(msg); } } @@ -139,20 +203,16 @@ int run(const std::unique_ptr &contextp, } static void help(const char **argv) { - std::cout << "Hex processor testbench\n\n"; + std::cout << "Hex multi-core processor testbench\n\n"; std::cout << "Usage: " << argv[0] << " file\n\n"; - std::cout << "Positional arguments:\n"; - std::cout << " file A binary file to execute\n\n"; - std::cout << "Optional arguments:\n"; + std::cout << " file A binary or network container to execute\n"; std::cout << " -h,--help Display this message\n"; - std::cout << " -t,--trace Enable instruction tracing\n"; - std::cout << " --max-cycles N Limit the number of simulation cycles " - "(default: 0)\n"; + std::cout << " -t,--trace Enable per-core PC tracing\n"; + std::cout << " --max-cycles N Limit simulation cycles (default: 0)\n"; } int main(int argc, const char **argv) { try { - // Handle arguments. 
const char *filename = nullptr; bool trace = false; size_t maxCycles = 0; @@ -167,28 +227,21 @@ int main(int argc, const char **argv) { } else if (std::strcmp(argv[i], "--max-cycles") == 0) { maxCycles = std::stoull(argv[++i]); } else if (argv[i][0] == '+') { - // Skip plusargs. continue; + } else if (!filename) { + filename = argv[i]; } else { - // Positional argument. - if (!filename) { - filename = argv[i]; - } else { - throw std::runtime_error("cannot specify more than one file"); - } + throw std::runtime_error("cannot specify more than one file"); } } - // Setup TB and DUT. - Verilated::mkdir("logs"); - const std::unique_ptr contextp{new VerilatedContext}; - contextp->debug(0); - contextp->randReset(2); - contextp->traceEverOn(true); - contextp->commandArgs(argc, argv); - const std::unique_ptr top{new Vhex_pkg{contextp.get(), "TOP"}}; - // Run. - load(filename, top); - return run(contextp, top, trace, maxCycles); + if (!filename) { + help(argv); + return 1; + } + const std::unique_ptr ctx{new VerilatedContext}; + ctx->commandArgs(argc, argv); + const std::unique_ptr top{new Vntb{ctx.get(), "TOP"}}; + return run(ctx, top, filename, trace, maxCycles); } catch (std::exception &e) { std::cerr << "Error: " << e.what() << "\n"; return 1; diff --git a/tests/rtl/core_tb.cpp b/tests/rtl/core_tb.cpp new file mode 100644 index 0000000..437dd72 --- /dev/null +++ b/tests/rtl/core_tb.cpp @@ -0,0 +1,67 @@ +#include +#include "Vcore.h" +#include "Vcore_core.h" +#include "Vcore_memory.h" +#include "Vcore_processor.h" +#include "flit_layout.hpp" +#include +#include +#include + +static void tick(Vcore *d) { d->i_clk = 0; d->eval(); d->i_clk = 1; d->eval(); } + +int main() { + Vcore *d = new Vcore; + + // Program (byte addresses): + // 0: PFIX 4 (0xE4) 1: LDAC 1 (0x31) -> areg = 0x41 = 65 ('A') + // 2: LDBC 0 (0x40) -> breg = 0 (channel slot 0) + // 3: OPR OUT (0xD5) -> send areg on channel 0 + // 4: NFIX F (0xFF) 5: BR F (0x9F) -> loop in place + d->i_rst = 1; d->i_clk = 0; 
d->eval(); + d->core->u_memory->memory_q[0] = 0xD54031E4u; + d->core->u_memory->memory_q[1] = 0x00009FFFu; + + d->i_core_id = 2; + // Router is always ready to accept injected flits; nothing delivered yet. + d->i_dnet_in_ready = 1; + d->i_anet_in_ready = 1; + d->i_dnet_valid = 0; + d->i_anet_valid = 0; + d->i_cfg_we = 0; + + tick(d); tick(d); d->i_rst = 0; + + // Configure route slot 0 -> (dst_core=1, dst_slot=0). + d->i_cfg_we = 1; d->i_cfg_slot = 0; d->i_cfg_dst_core = 1; d->i_cfg_dst_slot = 0; + tick(d); + d->i_cfg_we = 0; + + // Run until the OUT injects a DATA flit (o_dnet_valid), or time out. + int guard = 0; + while (!d->o_dnet_valid && guard++ < 50) tick(d); + assert(d->o_dnet_valid && "core never injected a DATA flit"); + + // Check the emitted flit. + assert(d->o_dnet_dst == 1 && "wrong destination core"); + assert(flit_dst_core(d->o_dnet_flit) == 1 && "flit dst_core wrong"); + assert(flit_dst_slot(d->o_dnet_flit) == 0 && "flit dst_slot wrong"); + assert(flit_src_core(d->o_dnet_flit) == 2 && "flit src_core wrong"); + assert(flit_word(d->o_dnet_flit) == 65 && "flit word wrong"); + + // The writer must stall (PC frozen) until the ACK returns. + unsigned pc_stalled = d->core->u_processor->pc_q; + tick(d); tick(d); + assert(d->core->u_processor->pc_q == pc_stalled && "PC advanced before ACK"); + + // Return the ACK; the writer should then unblock and advance. 
+ d->i_anet_valid = 1; + tick(d); + d->i_anet_valid = 0; + tick(d); + assert(d->core->u_processor->pc_q != pc_stalled && "core did not unblock after ACK"); + + printf("core_tb PASS\n"); + delete d; + return 0; +} diff --git a/tests/rtl/flit_layout.hpp b/tests/rtl/flit_layout.hpp new file mode 100644 index 0000000..5e08814 --- /dev/null +++ b/tests/rtl/flit_layout.hpp @@ -0,0 +1,27 @@ +#ifndef HEX_RTL_FLIT_LAYOUT_HPP +#define HEX_RTL_FLIT_LAYOUT_HPP + +#include + +// data_flit_t packed layout (38 bits, MSB..LSB), exposed by Verilator as a +// 64-bit QData: +// [37:36] dst_core [35:34] dst_slot [33:32] src_core [31:0] word +// ack_flit_t is just [1:0] dst_core (CData). +// +// Shared by the link-interface and core testbenches so the bit offsets live in +// one place. + +inline uint64_t make_dnet_flit(uint32_t dst_core, uint32_t dst_slot, + uint32_t src_core, uint32_t word) { + return (static_cast(dst_core & 0x3) << 36) | + (static_cast(dst_slot & 0x3) << 34) | + (static_cast(src_core & 0x3) << 32) | + static_cast(word); +} + +inline uint32_t flit_word(uint64_t f) { return static_cast(f); } +inline uint32_t flit_src_core(uint64_t f) { return (f >> 32) & 0x3; } +inline uint32_t flit_dst_slot(uint64_t f) { return (f >> 34) & 0x3; } +inline uint32_t flit_dst_core(uint64_t f) { return (f >> 36) & 0x3; } + +#endif // HEX_RTL_FLIT_LAYOUT_HPP diff --git a/tests/rtl/liu_tb.cpp b/tests/rtl/liu_tb.cpp new file mode 100644 index 0000000..db672dd --- /dev/null +++ b/tests/rtl/liu_tb.cpp @@ -0,0 +1,183 @@ +#include +#include "Vlink_interface.h" +#include "flit_layout.hpp" +#include +#include +#include + +// One clock edge. +static void tick(Vlink_interface *d) { d->i_clk = 0; d->eval(); d->i_clk = 1; d->eval(); } + +int main() { + Vlink_interface *d = new Vlink_interface; + + // ----------------------------------------------------------------------- + // Test 1: OUT — inject DATA flit and wait for ACK. 
+ // Config: slot 0 -> (dst_core=2, dst_slot=1) + // ----------------------------------------------------------------------- + { + // Reset. + d->i_rst = 1; + d->i_clk = 0; + d->i_core_id = 0; + d->i_op_out = 0; + d->i_op_in = 0; + d->i_slot = 0; + d->i_areg = 0; + d->i_cfg_we = 0; + d->i_dnet_in_ready = 0; + d->i_dnet_valid = 0; + d->i_dnet_flit = 0; + d->i_anet_in_ready = 0; + d->i_anet_valid = 0; + d->i_anet_flit = 0; + tick(d); tick(d); + d->i_rst = 0; + + // Config: slot 0 -> (dst_core=2, dst_slot=1). + d->i_cfg_we = 1; + d->i_cfg_slot = 0; + d->i_cfg_dst_core = 2; + d->i_cfg_dst_slot = 1; + tick(d); + d->i_cfg_we = 0; + + // Drive OUT op on slot 0, areg=0x55. + d->i_op_out = 1; + d->i_slot = 0; + d->i_areg = 0x55; + d->i_dnet_in_ready = 1; // network ready immediately + tick(d); // IDLE->OUT_SEND transition; outputs live from now + + // FSM is now in OUT_SEND; combinational outputs valid. + d->eval(); + assert(d->o_busy && "Test1: should be busy in OUT_SEND"); + assert(d->o_dnet_valid && "Test1: o_dnet_valid should be high"); + assert(flit_dst_core(d->o_dnet_flit) == 2 && "Test1: dst_core should be 2"); + assert(flit_dst_slot(d->o_dnet_flit) == 1 && "Test1: dst_slot should be 1"); + assert(flit_src_core(d->o_dnet_flit) == 0 && "Test1: src_core should be 0"); + assert(flit_word(d->o_dnet_flit) == 0x55 && "Test1: word should be 0x55"); + assert(d->o_dnet_dst == 2 && "Test1: o_dnet_dst should be 2"); + + // Tick: i_dnet_in_ready=1 so OUT_SEND->OUT_WAIT. + tick(d); + d->i_dnet_in_ready = 0; + + // Drive ACK arrival. + d->i_anet_valid = 1; + d->i_anet_flit = 0; // dst_core=0 (this core) + d->eval(); + // In OUT_WAIT with i_anet_valid: o_done pulses, o_busy drops. + assert(d->o_done && "Test1: o_done should pulse when ACK arrives"); + assert(!d->o_busy && "Test1: o_busy should drop when o_done"); + + // Tick: OUT_WAIT->IDLE. 
+ tick(d); + d->i_anet_valid = 0; + d->i_op_out = 0; + printf("Test 1 (OUT inject + ACK): PASS\n"); + } + + // ----------------------------------------------------------------------- + // Test 2: IN — wait for slot, deliver word, emit ACK. + // ----------------------------------------------------------------------- + { + // Fresh reset. + d->i_rst = 1; + d->i_op_out = 0; + d->i_op_in = 0; + d->i_dnet_valid = 0; + d->i_anet_valid = 0; + tick(d); tick(d); + d->i_rst = 0; + + // Drive IN on slot 3 (buffer empty => stay IDLE, busy). + d->i_op_in = 1; + d->i_slot = 3; + d->eval(); + assert(d->o_busy && "Test2: should be busy"); + assert(!d->o_done && "Test2: no o_done yet, buffer empty"); + + tick(d); // IDLE (rx_valid[3]=0), stays IDLE + assert(d->o_busy && "Test2: still busy after tick"); + assert(!d->o_done && "Test2: still no o_done"); + + // Deliver DATA flit: dst_slot=3, src_core=1, word=0x99. + // o_dnet_out_ready should be high (slot 3 buffer empty) when valid is driven. + d->i_dnet_valid = 1; + d->i_dnet_flit = make_dnet_flit(/*dst_core=*/0, /*dst_slot=*/3, + /*src_core=*/1, /*word=*/0x99); + d->eval(); + assert(d->o_dnet_out_ready && "Test2: o_dnet_out_ready should be high (slot 3 empty)"); + tick(d); + // Flit accepted on this clock edge: rx_valid[3] set. + // FSM was IDLE and checked rx_valid[3] on this posedge — still 0 (registered). + // So FSM stays IDLE this tick; need one more tick to see rx_valid[3]=1 -> IN_ACK. + d->i_dnet_valid = 0; + tick(d); // IDLE sees rx_valid[3]=1, transitions to IN_ACK. + + // Now in IN_ACK. + d->eval(); + assert(d->o_anet_valid && "Test2: o_anet_valid should be high in IN_ACK"); + assert(d->o_anet_dst == 1 && "Test2: o_anet_dst should be 1"); + assert((d->o_anet_flit & 0x3) == 1 && "Test2: ACK flit dst_core should be 1"); + assert(!d->o_done && "Test2: o_done not yet, anet_in_ready not set"); + + // Drive i_anet_in_ready; o_done pulses, o_in_word=0x99. 
+ d->i_anet_in_ready = 1; + d->eval(); + assert(d->o_done && "Test2: o_done should pulse on anet_in_ready"); + assert(!d->o_busy && "Test2: o_busy should drop with o_done"); + assert(d->o_in_word == 0x99 && "Test2: o_in_word should be 0x99"); + + tick(d); + d->i_anet_in_ready = 0; + d->i_op_in = 0; + printf("Test 2 (IN wait + deliver + ACK): PASS\n"); + } + + // ----------------------------------------------------------------------- + // Test 3: Per-slot independence — deliver to slot 1 while waiting on slot 0. + // ----------------------------------------------------------------------- + { + // Fresh reset. + d->i_rst = 1; + d->i_op_out = 0; + d->i_op_in = 0; + d->i_dnet_valid = 0; + d->i_anet_valid = 0; + d->i_anet_in_ready = 0; + tick(d); tick(d); + d->i_rst = 0; + + // Drive IN on slot 0 (buffer empty => stay IDLE, busy). + d->i_op_in = 1; + d->i_slot = 0; + tick(d); // IDLE, rx_valid[0]=0, stays IDLE + assert(d->o_busy && "Test3: busy waiting on slot 0"); + assert(!d->o_done && "Test3: no done for slot 0"); + + // Deliver a DATA flit for slot 1 (different from the awaited slot 0). + // o_dnet_out_ready should be high (slot 1 buffer empty) when valid is driven. + d->i_dnet_valid = 1; + d->i_dnet_flit = make_dnet_flit(/*dst_core=*/0, /*dst_slot=*/1, + /*src_core=*/2, /*word=*/0xAB); + d->eval(); + assert(d->o_dnet_out_ready && "Test3: ready to accept flit for slot 1"); + tick(d); + // Flit accepted into slot 1 buffer; FSM still IDLE (slot 0 still empty). + d->i_dnet_valid = 0; + + // Slot 0 IN still waiting; no spurious done or ACK. 
+    d->eval();
+    assert(d->o_busy && "Test3: still busy waiting on slot 0");
+    assert(!d->o_done && "Test3: slot 0 still not filled");
+    assert(!d->o_anet_valid && "Test3: no ACK emitted (slot 0 not filled)");
+
+    printf("Test 3 (per-slot independence): PASS\n");
+  }
+
+  printf("liu_tb PASS\n");
+  delete d;
+  return 0;
+}
diff --git a/tests/rtl/router_tb.cpp b/tests/rtl/router_tb.cpp
new file mode 100644
index 0000000..e20c502
--- /dev/null
+++ b/tests/rtl/router_tb.cpp
@@ -0,0 +1,50 @@
+#include <verilated.h>  // NOTE(review): header name reconstructed (Verilator runtime) -- confirm
+#include "Vrouter.h"
+#include <cassert>
+#include <cstdio>
+
+// Drive one full clock cycle: evaluate with i_clk low, then with i_clk high.
+static void tick(Vrouter *d) { d->i_clk = 0; d->eval(); d->i_clk = 1; d->eval(); }
+
+int main() {
+  Vrouter *d = new Vrouter;
+  d->i_rst = 1; tick(d); tick(d); d->i_rst = 0;
+  // Hold output 2 stalled so the delivered flit is held (and observable), which
+  // also exercises output back-pressure: the flit must not be lost or
+  // overwritten while i_out_ready is low.
+  d->i_out_ready = 0x0;
+  // Inject a flit on input 0 addressed to output 2.
+  d->i_in_valid = 0x1;
+  d->i_in_dst[0] = 2;
+  d->i_in_flit[0] = 0xABCD;
+  tick(d); // load input buffer
+  d->i_in_valid = 0x0;
+  tick(d); // grant -> output register (held, output not ready)
+  tick(d); // still held while stalled
+  assert((d->o_out_valid & (1 << 2)) && "flit not delivered/held at output 2");
+  assert(d->o_out_flit[2] == 0xABCD && "wrong flit payload");
+  // Now drain it.
+  d->i_out_ready = 0xF; tick(d);
+  assert(!(d->o_out_valid & (1 << 2)) && "output 2 did not drain when ready");
+
+  // Fairness: inputs 0 and 1 both target output 3; both must drain (no starvation).
+  d->i_rst = 1; tick(d); tick(d); d->i_rst = 0;
+  d->i_out_ready = 0xF;
+  d->i_in_valid = 0x3;
+  d->i_in_dst[0] = 3; d->i_in_flit[0] = 0x11;
+  d->i_in_dst[1] = 3; d->i_in_flit[1] = 0x22;
+  bool seen11 = false, seen22 = false;
+  for (int i = 0; i < 8; i++) {
+    tick(d);
+    d->i_in_valid = 0x0; // injected once; let them flow through
+    if (d->o_out_valid & (1 << 3)) {
+      if (d->o_out_flit[3] == 0x11) seen11 = true;
+      if (d->o_out_flit[3] == 0x22) seen22 = true;
+    }
+  }
+  assert(seen11 && seen22 && "round-robin starved an input");
+
+  printf("router_tb PASS\n");
+  delete d;
+  return 0;
+}
diff --git a/tests/tests.py b/tests/tests.py
index 958172e..26c5b82 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -98,6 +98,27 @@ def test_x_compiler_verilator(self):
         else:
             pass
 
+    def run_message_passing(self, filename, expected):
+        # Compile a message-passing program (xcmp) to a network container once,
+        # then run that same container on the C++ simulator and, under
+        # Verilator, on the RTL -- a golden cross-check that both agree.
+        src = os.path.join(defs.X_TEST_SRC_PREFIX, filename)
+        subprocess.run([CMP_BINARY, src, "-o", "net.bin"], check=True)  # fail fast; never simulate a stale net.bin
+        sim = subprocess.run([SIM_BINARY, "net.bin"], capture_output=True)
+        self.assertEqual(sim.stdout.decode("utf-8"), expected)
+        if defs.USE_VERILATOR:
+            tb = subprocess.run([VTB_BINARY, "net.bin"], capture_output=True)
+            self.assertTrue(tb.stdout.decode("utf-8").endswith(expected))  # suffix match only, unlike the exact match above
+
+    def test_message_passing_pipe(self):
+        self.run_message_passing("pipe.x", "P")
+
+    def test_message_passing_pingpong(self):
+        self.run_message_passing("pingpong.x", "X")
+
+    def test_message_passing_ring(self):
+        self.run_message_passing("ring.x", "Z")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit/x_features.cpp b/tests/unit/x_features.cpp
index 5d9939d..989bbdf 100644
--- a/tests/unit/x_features.cpp
+++ b/tests/unit/x_features.cpp
@@ -1617,3 +1617,15 @@ TEST_CASE("message_passing_run_pipe_x_file") {
   REQUIRE(ctx.runXProgramFile(ctx.getXTestPath("pipe.x")) == 0);
   REQUIRE(ctx.simOutBuffer.str() == "P");
 }
+
+TEST_CASE("message_passing_run_pingpong_x_file") {
+  TestContext ctx;
+  REQUIRE(ctx.runXProgramFile(ctx.getXTestPath("pingpong.x")) == 0);
+  REQUIRE(ctx.simOutBuffer.str() == "X");
+}
+
+TEST_CASE("message_passing_run_ring_x_file") {
+  TestContext ctx;
+  REQUIRE(ctx.runXProgramFile(ctx.getXTestPath("ring.x")) == 0);
+  REQUIRE(ctx.simOutBuffer.str() == "Z");
+}
diff --git a/tests/x/pingpong.x b/tests/x/pingpong.x
new file mode 100644
index 0000000..3dbc1c2
--- /dev/null
+++ b/tests/x/pingpong.x
@@ -0,0 +1,22 @@
+| Two processors exchanging a value: the pinger sends a value to the ponger,
+| which echoes it back, and the pinger prints what returned ('X' = 88).
+
+val put = 1;
+
+proc putval(val c) is put(c, 0)
+
+proc pinger(chan out, chan in) is
+  var v;
+  { out ! 88;     | send to the ponger
+    in ? v;       | receive the echo
+    putval(v) }   | print it
+
+proc ponger(chan in, chan out) is
+  var v;
+  { in ? v;       | receive from the pinger
+    out ! v }     | echo it back
+
+proc main() is
+  chan a;
+  chan b;
+  par { pinger(a, b); ponger(a, b) }
diff --git a/tests/x/pipe.x b/tests/x/pipe.x
index 67d650a..38b27a8 100644
--- a/tests/x/pipe.x
+++ b/tests/x/pipe.x
@@ -1,3 +1,8 @@
+| A three-stage pipeline on three processors connected by two channels:
+| source -> relay -> sink. The source emits a character, the relay forwards
+| it, and the sink prints it ('P'). Demonstrates par, chan formals and the
+| ! / ? channel operators.
+
 val put = 1;
 
 proc putval(val c) is put(c, 0)
diff --git a/tests/x/ring.x b/tests/x/ring.x
new file mode 100644
index 0000000..160b925
--- /dev/null
+++ b/tests/x/ring.x
@@ -0,0 +1,25 @@
+| A token ring of three processors. The starter injects a token which is
+| passed around the ring by two reused forwarder processes and returns to the
+| starter, which prints it ('Z' = 90). Demonstrates proc reuse: the same
+| forwarder runs on two processors with different channel arguments.
+
+val put = 1;
+
+proc putval(val c) is put(c, 0)
+
+proc starter(chan out, chan in) is
+  var v;
+  { out ! 90;     | inject the token
+    in ? v;       | wait for it to return around the ring
+    putval(v) }   | print it
+
+proc forwarder(chan in, chan out) is
+  var v;
+  { in ? v;       | receive from the previous node
+    out ! v }     | pass to the next node
+
+proc main() is
+  chan a;
+  chan b;
+  chan c;
+  par { starter(a, c); forwarder(a, b); forwarder(b, c) }
diff --git a/verilog/core.sv b/verilog/core.sv
new file mode 100644
index 0000000..0b6c559
--- /dev/null
+++ b/verilog/core.sv
@@ -0,0 +1,119 @@
+// A single processor core: processor + private memory + per-core link
+// interface, exposing the syscall interface, the route-table config port, and
+// the DATA/ACK network ports for connection to the router(s).
+module core
+  (
+    input  logic                 i_clk,
+    input  logic                 i_rst,
+    input  hex_pkg::core_id_t    i_core_id,
+    // Syscall interface
+    output logic                 o_syscall_valid,
+    output hex_pkg::syscall_t    o_syscall,
+    // Route-table config (reset-time).
+    input  logic                 i_cfg_we,
+    input  hex_pkg::slot_t       i_cfg_slot,
+    input  hex_pkg::core_id_t    i_cfg_dst_core,
+    input  hex_pkg::slot_t       i_cfg_dst_slot,
+    // DATA network.
+    output logic                 o_dnet_valid,
+    output hex_pkg::core_id_t    o_dnet_dst,
+    output hex_pkg::data_flit_t  o_dnet_flit,
+    input  logic                 i_dnet_in_ready,
+    input  logic                 i_dnet_valid,
+    input  hex_pkg::data_flit_t  i_dnet_flit,
+    output logic                 o_dnet_out_ready,
+    // ACK network.
+    output logic                 o_anet_valid,
+    output hex_pkg::core_id_t    o_anet_dst,
+    output hex_pkg::ack_flit_t   o_anet_flit,
+    input  logic                 i_anet_in_ready,
+    input  logic                 i_anet_valid,
+    input  hex_pkg::ack_flit_t   i_anet_flit,
+    output logic                 o_anet_out_ready
+  );
+
+  // Memory fetch/data ports (processor <-> memory request/response nets).
+  logic             req_f_valid;
+  hex_pkg::iaddr_t  req_f_addr;
+  hex_pkg::instr_t  res_f_data;
+  logic             req_d_valid;
+  logic             req_d_we;
+  hex_pkg::waddr_t  req_d_addr;
+  hex_pkg::data_t   req_d_data;
+  hex_pkg::data_t   res_d_data;
+
+  // Processor <-> link interface.
+  logic             op_out;
+  logic             op_in;
+  hex_pkg::slot_t   chan_slot;
+  hex_pkg::data_t   chan_areg;
+  logic             liu_busy;
+  logic             liu_done;     // driven by u_liu.o_done; no other reader in this module
+  hex_pkg::data_t   liu_in_word;
+
+  processor u_processor (
+    .i_rst            (i_rst),
+    .i_clk            (i_clk),
+    .o_f_valid        (req_f_valid),
+    .o_f_addr         (req_f_addr),
+    .i_f_data         (res_f_data),
+    .o_d_valid        (req_d_valid),
+    .o_d_we           (req_d_we),
+    .o_d_addr         (req_d_addr),
+    .o_d_data         (req_d_data),
+    .i_d_data         (res_d_data),
+    .o_syscall_valid  (o_syscall_valid),
+    .o_syscall        (o_syscall),
+    .o_op_out         (op_out),
+    .o_op_in          (op_in),
+    .o_chan_slot      (chan_slot),
+    .o_chan_areg      (chan_areg),
+    .i_liu_busy       (liu_busy),
+    .i_liu_in_word    (liu_in_word)
+  );
+
+  memory u_memory (
+    .i_rst      (i_rst),
+    .i_clk      (i_clk),
+    .i_f_valid  (req_f_valid),
+    .i_f_addr   (req_f_addr),
+    .o_f_data   (res_f_data),
+    .i_d_valid  (req_d_valid),
+    .i_d_addr   (req_d_addr),
+    .i_d_we     (req_d_we),
+    .i_d_data   (req_d_data),
+    .o_d_data   (res_d_data)
+  );
+
+  link_interface u_liu (
+    .i_clk             (i_clk),
+    .i_rst             (i_rst),
+    .i_core_id         (i_core_id),
+    .i_op_out          (op_out),
+    .i_op_in           (op_in),
+    .i_slot            (chan_slot),
+    .i_areg            (chan_areg),
+    .o_busy            (liu_busy),
+    .o_done            (liu_done),
+    .o_in_word         (liu_in_word),
+    .i_cfg_we          (i_cfg_we),
+    .i_cfg_slot        (i_cfg_slot),
+    .i_cfg_dst_core    (i_cfg_dst_core),
+    .i_cfg_dst_slot    (i_cfg_dst_slot),
+    .o_dnet_valid      (o_dnet_valid),
+    .o_dnet_dst        (o_dnet_dst),
+    .o_dnet_flit       (o_dnet_flit),
+    .i_dnet_in_ready   (i_dnet_in_ready),
+    .i_dnet_valid      (i_dnet_valid),
+    .i_dnet_flit       (i_dnet_flit),
+    .o_dnet_out_ready  (o_dnet_out_ready),
+    .o_anet_valid      (o_anet_valid),
+    .o_anet_dst        (o_anet_dst),
+    .o_anet_flit       (o_anet_flit),
+    .i_anet_in_ready   (i_anet_in_ready),
+    .i_anet_valid      (i_anet_valid),
+    .i_anet_flit       (i_anet_flit),
+    .o_anet_out_ready  (o_anet_out_ready)
+  );
+
+endmodule
diff --git a/verilog/hex.sv b/verilog/hex.sv
index 44cc2d1..3c6e10c 100644
--- a/verilog/hex.sv
+++ b/verilog/hex.sv
@@ -30,7 +30,14 @@ module hex
     .o_d_data         (req_d_data),
     .i_d_data         (res_d_data),
     .o_syscall_valid  (o_syscall_valid),
-    .o_syscall        (o_syscall)
+    .o_syscall        (o_syscall),
+    // No channels in the single-core top: never stall, no received word.
+    .o_op_out         (),
+    .o_op_in          (),
+    .o_chan_slot      (),
+    .o_chan_areg      (),
+    .i_liu_busy       (1'b0),
+    .i_liu_in_word    ('0)
   );
 
   memory u_memory (
diff --git a/verilog/hex_pkg.sv b/verilog/hex_pkg.sv
index fdb194f..64f91ba 100644
--- a/verilog/hex_pkg.sv
+++ b/verilog/hex_pkg.sv
@@ -12,6 +12,13 @@ package hex_pkg;
   localparam INSTR_WIDTH = INSTR_OPC_WIDTH + INSTR_OPR_WIDTH;
   localparam SYSCALL_OPC_WIDTH = 2;
 
+  // Message passing: each core has NUM_LINKS logical channels (slots), and the
+  // network connects up to NUM_CORES cores.
+  localparam NUM_LINKS = 4;
+  localparam NUM_CORES = 4;
+  localparam SLOT_W = $clog2(NUM_LINKS); // = 2
+  localparam CID_W = $clog2(NUM_CORES); // = 2
+
   typedef logic [MEM_ADDR_WIDTH-1:0] iaddr_t; // Byte/instruction address
   typedef logic [MEM_ADDR_WIDTH-1:2] waddr_t; // Word address
   typedef logic [MEM_WIDTH-1:0] data_t;
@@ -39,7 +46,9 @@ package hex_pkg;
     BRB = 0,
     ADD = 1,
     SUB = 2,
-    SVC = 3
+    SVC = 3,
+    IN = 4,
+    OUT = 5
   } opr_opcode_t;
 
   typedef enum logic [SYSCALL_OPC_WIDTH-1:0] {
@@ -53,4 +62,22 @@ package hex_pkg;
     operand_t operand;
   } instr_t;
 
+  // Network addressing and flits.
+  typedef logic [CID_W-1:0] core_id_t;
+  typedef logic [SLOT_W-1:0] slot_t;
+
+  // A data packet: one word to (dst_core, dst_slot), carrying its source core
+  // so the reader can address the acknowledgement back.
+  typedef struct packed {
+    core_id_t dst_core;
+    slot_t dst_slot;
+    core_id_t src_core;
+    data_t word;
+  } data_flit_t;
+
+  // An acknowledgement packet: routed back to the waiting writer.
+  typedef struct packed {
+    core_id_t dst_core;
+  } ack_flit_t;
+
 endpackage
diff --git a/verilog/link_interface.sv b/verilog/link_interface.sv
new file mode 100644
index 0000000..eba6146
--- /dev/null
+++ b/verilog/link_interface.sv
@@ -0,0 +1,121 @@
+module link_interface (
+    input  logic                 i_clk,
+    input  logic                 i_rst,
+    input  hex_pkg::core_id_t    i_core_id,
+    // Processor side.
+    input  logic                 i_op_out,
+    input  logic                 i_op_in,
+    input  hex_pkg::slot_t       i_slot,
+    input  hex_pkg::data_t       i_areg,
+    output logic                 o_busy,
+    output logic                 o_done,
+    output hex_pkg::data_t       o_in_word,
+    // Route table config (reset-time).
+    input  logic                 i_cfg_we,
+    input  hex_pkg::slot_t       i_cfg_slot,
+    input  hex_pkg::core_id_t    i_cfg_dst_core,
+    input  hex_pkg::slot_t       i_cfg_dst_slot,
+    // DATA network.
+    output logic                 o_dnet_valid,
+    output hex_pkg::core_id_t    o_dnet_dst,
+    output hex_pkg::data_flit_t  o_dnet_flit,
+    input  logic                 i_dnet_in_ready,
+    input  logic                 i_dnet_valid,
+    input  hex_pkg::data_flit_t  i_dnet_flit,
+    output logic                 o_dnet_out_ready,
+    // ACK network.
+    output logic                 o_anet_valid,
+    output hex_pkg::core_id_t    o_anet_dst,
+    output hex_pkg::ack_flit_t   o_anet_flit,
+    input  logic                 i_anet_in_ready,
+    input  logic                 i_anet_valid,
+    input  hex_pkg::ack_flit_t   i_anet_flit,
+    output logic                 o_anet_out_ready
+  );
+
+  // Route table: slot -> (dst_core, dst_slot).
+  hex_pkg::core_id_t rt_core [hex_pkg::NUM_LINKS];
+  hex_pkg::slot_t    rt_slot [hex_pkg::NUM_LINKS];
+  always_ff @(posedge i_clk) // no reset branch: an entry changes only on a config write
+    if (i_cfg_we) begin
+      rt_core[i_cfg_slot] <= i_cfg_dst_core;
+      rt_slot[i_cfg_slot] <= i_cfg_dst_slot;
+    end
+
+  // Per-slot receive buffers.
+  logic [hex_pkg::NUM_LINKS-1:0] rx_valid;
+  hex_pkg::core_id_t             rx_src  [hex_pkg::NUM_LINKS];
+  hex_pkg::data_t                rx_word [hex_pkg::NUM_LINKS];
+
+  // Accept a delivered DATA flit into its (provably empty) slot buffer.
+  // (i_dnet_flit.dst_core is not needed: flit is already routed to this core.)
+  assign o_dnet_out_ready = i_dnet_valid && !rx_valid[i_dnet_flit.dst_slot];
+  // ACKs are always accepted (this core is the waiting writer).
+  // (i_anet_flit carries no payload; only i_anet_valid is inspected.)
+  assign o_anet_out_ready = 1'b1;
+
+  typedef enum logic [1:0] { IDLE, OUT_SEND, OUT_WAIT, IN_ACK } state_t;
+  state_t st;
+
+  // Combinational outputs: defaults first, then per-state overrides.
+  always_comb begin
+    o_dnet_valid = 1'b0;
+    o_dnet_dst   = '0;
+    o_dnet_flit  = '0;
+    o_anet_valid = 1'b0;
+    o_anet_dst   = '0;
+    o_anet_flit  = '0;
+    o_done       = 1'b0;
+    o_in_word    = rx_word[i_slot];
+    o_busy       = (i_op_out || i_op_in); // busy by default whenever an op is requested
+    unique case (st)
+      OUT_SEND: begin
+        o_dnet_valid         = 1'b1;
+        o_dnet_dst           = rt_core[i_slot];
+        o_dnet_flit.dst_core = rt_core[i_slot];
+        o_dnet_flit.dst_slot = rt_slot[i_slot];
+        o_dnet_flit.src_core = i_core_id;
+        o_dnet_flit.word     = i_areg;
+      end
+      OUT_WAIT: begin
+        if (i_anet_valid) o_done = 1'b1; // ACK for this core
+      end
+      IN_ACK: begin
+        o_anet_valid         = 1'b1;
+        o_anet_dst           = rx_src[i_slot];
+        o_anet_flit.dst_core = rx_src[i_slot];
+        if (i_anet_in_ready) o_done = 1'b1;
+      end
+      default: ; // IDLE
+    endcase
+    if (o_done) o_busy = 1'b0; // busy drops in the same cycle done is asserted
+  end
+
+  // Sequential: FSM + receive buffers.
+  always_ff @(posedge i_clk or posedge i_rst)
+    if (i_rst) begin
+      st       <= IDLE;
+      rx_valid <= '0; // only st and rx_valid are reset; rx_src/rx_word are not
+    end else begin
+      // Receive a delivered DATA flit.
+      if (i_dnet_valid && o_dnet_out_ready) begin
+        rx_valid[i_dnet_flit.dst_slot] <= 1'b1;
+        rx_src  [i_dnet_flit.dst_slot] <= i_dnet_flit.src_core;
+        rx_word [i_dnet_flit.dst_slot] <= i_dnet_flit.word;
+      end
+      // FSM.
+      unique case (st)
+        IDLE: begin
+          if (i_op_out) st <= OUT_SEND;
+          else if (i_op_in && rx_valid[i_slot]) st <= IN_ACK;
+        end
+        OUT_SEND: if (i_dnet_in_ready) st <= OUT_WAIT;
+        OUT_WAIT: if (i_anet_valid) st <= IDLE;
+        IN_ACK: if (i_anet_in_ready) begin
+          rx_valid[i_slot] <= 1'b0; // ACK accepted: mark the slot buffer empty again
+          st <= IDLE;
+        end
+      endcase
+    end
+
+endmodule
diff --git a/verilog/network_top.sv b/verilog/network_top.sv
new file mode 100644
index 0000000..3b6f832
--- /dev/null
+++ b/verilog/network_top.sv
@@ -0,0 +1,105 @@
+// A fixed network of NUM_CORES cores connected by a DATA router and an ACK
+// router. The wiring is static; the per-core route tables (slot -> address)
+// are programmed at reset via the config port, so any topology that fits is
+// realised by configuration rather than re-elaboration.
+//
+// Core k injects into router input port k and is delivered from router output
+// port k. A flit's dst_core selects the destination output port.
+module network_top
+  (
+    input  logic                            i_clk,
+    input  logic                            i_rst,
+    // Per-core syscall interface.
+    output logic [hex_pkg::NUM_CORES-1:0]   o_syscall_valid,
+    output hex_pkg::syscall_t               o_syscall [hex_pkg::NUM_CORES],
+    // Route-table config: write (dst_core,dst_slot) to core i_cfg_core slot
+    // i_cfg_slot when i_cfg_we is high.
+    input  logic                            i_cfg_we,
+    input  hex_pkg::core_id_t               i_cfg_core,
+    input  hex_pkg::slot_t                  i_cfg_slot,
+    input  hex_pkg::core_id_t               i_cfg_dst_core,
+    input  hex_pkg::slot_t                  i_cfg_dst_slot
+  );
+
+  localparam int N = hex_pkg::NUM_CORES;
+
+  // DATA network wiring (core k <-> router port k).
+  logic [N-1:0]         dnet_in_valid;
+  hex_pkg::core_id_t    dnet_in_dst [N];
+  hex_pkg::data_flit_t  dnet_in_flit [N];
+  logic [N-1:0]         dnet_in_ready;
+  logic [N-1:0]         dnet_out_valid;
+  hex_pkg::data_flit_t  dnet_out_flit [N];
+  logic [N-1:0]         dnet_out_ready;
+
+  // ACK network wiring.
+  logic [N-1:0]         anet_in_valid;
+  hex_pkg::core_id_t    anet_in_dst [N];
+  hex_pkg::ack_flit_t   anet_in_flit [N];
+  logic [N-1:0]         anet_in_ready;
+  logic [N-1:0]         anet_out_valid;
+  hex_pkg::ack_flit_t   anet_out_flit [N];
+  logic [N-1:0]         anet_out_ready;
+
+  // Per-core config write enable (decoded from i_cfg_core).
+  logic [N-1:0] cfg_we;
+  always_comb
+    for (int k = 0; k < N; k++)
+      cfg_we[k] = i_cfg_we && (i_cfg_core == k[hex_pkg::CID_W-1:0]);
+
+  genvar k;
+  generate
+    for (k = 0; k < N; k++) begin : g_core
+      core u_core (
+        .i_clk             (i_clk),
+        .i_rst             (i_rst),
+        .i_core_id         (k[hex_pkg::CID_W-1:0]), // core id = generate index
+        .o_syscall_valid   (o_syscall_valid[k]),
+        .o_syscall         (o_syscall[k]),
+        .i_cfg_we          (cfg_we[k]),
+        .i_cfg_slot        (i_cfg_slot),
+        .i_cfg_dst_core    (i_cfg_dst_core),
+        .i_cfg_dst_slot    (i_cfg_dst_slot),
+        .o_dnet_valid      (dnet_in_valid[k]),
+        .o_dnet_dst        (dnet_in_dst[k]),
+        .o_dnet_flit       (dnet_in_flit[k]),
+        .i_dnet_in_ready   (dnet_in_ready[k]),
+        .i_dnet_valid      (dnet_out_valid[k]),
+        .i_dnet_flit       (dnet_out_flit[k]),
+        .o_dnet_out_ready  (dnet_out_ready[k]),
+        .o_anet_valid      (anet_in_valid[k]),
+        .o_anet_dst        (anet_in_dst[k]),
+        .o_anet_flit       (anet_in_flit[k]),
+        .i_anet_in_ready   (anet_in_ready[k]),
+        .i_anet_valid      (anet_out_valid[k]),
+        .i_anet_flit       (anet_out_flit[k]),
+        .o_anet_out_ready  (anet_out_ready[k])
+      );
+    end
+  endgenerate
+
+  router #(.FLIT_W($bits(hex_pkg::data_flit_t))) u_dnet (
+    .i_clk        (i_clk),
+    .i_rst        (i_rst),
+    .i_in_valid   (dnet_in_valid),
+    .i_in_dst     (dnet_in_dst),
+    .i_in_flit    (dnet_in_flit),
+    .o_in_ready   (dnet_in_ready),
+    .o_out_valid  (dnet_out_valid),
+    .o_out_flit   (dnet_out_flit),
+    .i_out_ready  (dnet_out_ready)
+  );
+
+  router #(.FLIT_W($bits(hex_pkg::ack_flit_t))) u_anet (
+    .i_clk        (i_clk),
+    .i_rst        (i_rst),
+    .i_in_valid   (anet_in_valid),
+    .i_in_dst     (anet_in_dst),
+    .i_in_flit    (anet_in_flit),
+    .o_in_ready   (anet_in_ready),
+    .o_out_valid  (anet_out_valid),
+    .o_out_flit   (anet_out_flit),
+    .i_out_ready  (anet_out_ready)
+  );
+
+endmodule
diff --git a/verilog/processor.sv b/verilog/processor.sv
index 70d3247..95f8db4 100644
--- a/verilog/processor.sv
+++ b/verilog/processor.sv
@@ -14,7 +14,14 @@ module processor
     input  hex_pkg::data_t       i_d_data,
     // Syscall interface
     output logic                 o_syscall_valid,
-    output hex_pkg::syscall_t    o_syscall
+    output hex_pkg::syscall_t    o_syscall,
+    // Channel link interface (to the per-core link_interface unit).
+    output logic                 o_op_out,
+    output logic                 o_op_in,
+    output hex_pkg::slot_t       o_chan_slot,
+    output hex_pkg::data_t       o_chan_areg,
+    input  logic                 i_liu_busy,
+    input  hex_pkg::data_t       i_liu_in_word
   );
 
   // State
@@ -26,6 +33,9 @@ module processor
   // Nets
   hex_pkg::instr_t instr /* verilator public */;
   logic instr_svc;
+  logic instr_in;
+  logic instr_out;
+  logic stall;
   hex_pkg::iaddr_t pc_d;
   hex_pkg::data_t areg_d;
   hex_pkg::data_t breg_d;
@@ -41,7 +51,7 @@ module processor
       areg_q <= '0;
       breg_q <= '0;
       oreg_q <= '0;
-    end else begin
+    end else if (!stall) begin
       pc_q <= pc_d;
       areg_q <= areg_d;
       breg_q <= breg_d;
@@ -55,6 +65,17 @@ module processor
   assign instr_opc = instr.opcode;
   assign instr_opr = instr.operand;
   assign instr_svc = instr.opcode == hex_pkg::OPR && instr.operand == hex_pkg::SVC;
+  assign instr_in = instr.opcode == hex_pkg::OPR && instr.operand == hex_pkg::IN;
+  assign instr_out = instr.opcode == hex_pkg::OPR && instr.operand == hex_pkg::OUT;
+
+  // Channel ops hand off to the link interface; the channel index is in breg
+  // and the word to send is in areg. The processor stalls (freezes all state)
+  // until the link interface completes the rendezvous.
+  assign o_op_out = instr_out;
+  assign o_op_in = instr_in;
+  assign o_chan_slot = breg_q[hex_pkg::SLOT_W-1:0]; // only the low SLOT_W bits of breg select the slot
+  assign o_chan_areg = areg_q;
+  assign stall = (instr_in || instr_out) && i_liu_busy;
 
   // Current operand (oreg) value.
   assign opr_d = oreg_q | {28'b0, instr.operand};
@@ -96,6 +117,7 @@ module processor
           unique case(instr.operand)
             hex_pkg::ADD: areg_d = {areg_q + breg_q};
             hex_pkg::SUB: areg_d = {areg_q - breg_q};
+            hex_pkg::IN: areg_d = i_liu_in_word;
             default:;
           endcase
         default:;
diff --git a/verilog/router.sv b/verilog/router.sv
new file mode 100644
index 0000000..5ab2c9d
--- /dev/null
+++ b/verilog/router.sv
@@ -0,0 +1,86 @@
+module router #(
+    parameter int FLIT_W = 32
+  ) (
+    input  logic                           i_clk,
+    input  logic                           i_rst,
+    // Injection ports (from cores).
+    input  logic [hex_pkg::NUM_CORES-1:0]  i_in_valid,
+    input  hex_pkg::core_id_t              i_in_dst [hex_pkg::NUM_CORES],
+    input  logic [FLIT_W-1:0]              i_in_flit [hex_pkg::NUM_CORES],
+    output logic [hex_pkg::NUM_CORES-1:0]  o_in_ready,
+    // Delivery ports (to cores).
+    output logic [hex_pkg::NUM_CORES-1:0]  o_out_valid,
+    output logic [FLIT_W-1:0]              o_out_flit [hex_pkg::NUM_CORES],
+    input  logic [hex_pkg::NUM_CORES-1:0]  i_out_ready
+  );
+
+  localparam int N = hex_pkg::NUM_CORES;
+
+  // 1-deep input buffers.
+  logic [N-1:0]       ibuf_valid;
+  hex_pkg::core_id_t  ibuf_dst [N];
+  logic [FLIT_W-1:0]  ibuf_flit [N];
+
+  // Registered outputs.
+  logic [N-1:0]       obuf_valid;
+  logic [FLIT_W-1:0]  obuf_flit [N];
+
+  // Round-robin priority pointer per output.
+  hex_pkg::core_id_t rr [N];
+
+  // Accept a new injection only when the input buffer is empty.
+  always_comb
+    for (int i = 0; i < N; i++)
+      o_in_ready[i] = !ibuf_valid[i];
+
+  // Per-output round-robin grant among input buffers addressed to it.
+  logic [N-1:0]       grant_vld;
+  hex_pkg::core_id_t  grant_idx [N];
+  always_comb begin
+    for (int j = 0; j < N; j++) begin
+      grant_vld[j] = 1'b0;
+      grant_idx[j] = '0;
+      for (int k = 0; k < N; k++) begin
+        automatic hex_pkg::core_id_t i = hex_pkg::core_id_t'((int'(rr[j]) + k) % N); // scan inputs starting at rr[j], wrapping
+        if (!grant_vld[j] && ibuf_valid[i] && (ibuf_dst[i] == j[hex_pkg::CID_W-1:0])) begin
+          grant_vld[j] = 1'b1;
+          grant_idx[j] = i;
+        end
+      end
+    end
+  end
+
+  assign o_out_valid = obuf_valid;
+  always_comb
+    for (int j = 0; j < N; j++)
+      o_out_flit[j] = obuf_flit[j];
+
+  always_ff @(posedge i_clk or posedge i_rst)
+    if (i_rst) begin
+      ibuf_valid <= '0;
+      obuf_valid <= '0;
+      for (int i = 0; i < N; i++) rr[i] <= '0; // the dst/flit payload registers are not reset
+    end else begin
+      // Load input buffers from injectors.
+      for (int i = 0; i < N; i++)
+        if (!ibuf_valid[i] && i_in_valid[i]) begin
+          ibuf_valid[i] <= 1'b1;
+          ibuf_dst[i]   <= i_in_dst[i];
+          ibuf_flit[i]  <= i_in_flit[i];
+        end
+      // Drain and (re)fill each output. A flit is granted to output j only when
+      // that output will be free this cycle, so a stalled output (i_out_ready
+      // low) holds its flit and back-pressures the input buffer (no flit loss).
+      for (int j = 0; j < N; j++) begin
+        if (obuf_valid[j] && i_out_ready[j])
+          obuf_valid[j] <= 1'b0;
+        if ((!obuf_valid[j] || i_out_ready[j]) && grant_vld[j]) begin
+          obuf_valid[j] <= 1'b1; // later assignment wins over the drain above when both fire
+          obuf_flit[j]  <= ibuf_flit[grant_idx[j]];
+          ibuf_valid[grant_idx[j]] <= 1'b0;
+          rr[j] <= hex_pkg::core_id_t'((int'(grant_idx[j]) + 1) % N); // next search starts after the input just served
+        end
+      end
+    end
+
+endmodule