diff --git a/workloads/gromacs/README.md b/workloads/gromacs/README.md index 845249d..defeda8 100644 --- a/workloads/gromacs/README.md +++ b/workloads/gromacs/README.md @@ -48,6 +48,35 @@ Configure the shim layer behavior using these environment variables: - `CXL_SHIM_COPY_SEND`: Copy send buffers to CXL memory (set to 1) - `CXL_SHIM_COPY_RECV`: Use CXL memory for receive buffers (set to 1) +### Choosing a Cache Coherence Variant + +Each `.so` is compiled with a specific cache coherence strategy. Pick the one that matches your needs: + +| Library | Flush after write | Invalidate before read | +|---------|-------|------------| +| `libmpi_cxl_shim_nocc.so` | none | none | +| `libmpi_cxl_shim.so` (same as `libmpi_cxl_shim_nocc.so`) | none | none | +| `libmpi_cxl_shim_cc_clwb_clflush.so` | clwb | clflush | +| `libmpi_cxl_shim_cc_clwb_clflushopt.so` | clwb | clflushopt | +| `libmpi_cxl_shim_cc_clflush_clflush.so` | clflush | clflush | +| `libmpi_cxl_shim_cc_clflush_clflushopt.so` | clflush | clflushopt | +| `libmpi_cxl_shim_cc_clflushopt_clflush.so` | clflushopt | clflush | +| `libmpi_cxl_shim_cc_clflushopt_clflushopt.so` | clflushopt | clflushopt | +Select a variant by setting `LD_PRELOAD` to the desired `.so` file. + +For example: +```bash +# No cache coherence (MPI_Barrier sync only) +export LD_PRELOAD=./libmpi_cxl_shim_nocc.so + +# Cache coherence with clwb + clflush +export LD_PRELOAD=./libmpi_cxl_shim_cc_clwb_clflush.so + +# Cache coherence with clflush + clflush +export LD_PRELOAD=./libmpi_cxl_shim_cc_clflush_clflush.so +``` + ### Running with GROMACS 1. 
**With DAX device (real CXL hardware):** diff --git a/workloads/gromacs/mpi_cxl_shim.c b/workloads/gromacs/mpi_cxl_shim.c index a1fae8c..25886a9 100644 --- a/workloads/gromacs/mpi_cxl_shim.c +++ b/workloads/gromacs/mpi_cxl_shim.c @@ -2470,6 +2470,27 @@ static void shim_init(void) { fprintf(stderr, "│ - Remotable pointers (offset-based addressing) │\n"); fprintf(stderr, "│ - Per-rank message queues in shared memory │\n"); fprintf(stderr, "│ - Inline small messages (<4KB) optimization │\n"); +#ifdef CXL_CACHE_COHERENCE + fprintf(stderr, "│ - Cache coherence: flush=%-4s invalidate=%-13s │\n", +#if defined(CXL_FLUSH_CLWB) + "clwb", +#elif defined(CXL_FLUSH_CLFLUSH) + "clflush", +#elif defined(CXL_FLUSH_CLFLUSHOPT) + "clflushopt", +#else + "?", +#endif +#if defined(CXL_INV_CLFLUSH) + "clflush"); +#elif defined(CXL_INV_CLFLUSHOPT) + "clflushopt"); +#else + "?"); +#endif +#else + fprintf(stderr, "│ - Cache coherence: disabled (MPI_Barrier sync) │\n"); +#endif fprintf(stderr, "├──────────────────────────────────────────────────────────┤\n"); fprintf(stderr, "│ CONFIGURATION: │\n"); fprintf(stderr, "│ CXL_DAX_PATH: %-40s │\n",